-rw-r--r--  Documentation/virtual/kvm/api.txt | 136
-rw-r--r--  MAINTAINERS | 1
-rw-r--r--  arch/arm/include/asm/kvm_asm.h | 4
-rw-r--r--  arch/arm/include/asm/kvm_host.h | 7
-rw-r--r--  arch/arm/include/asm/kvm_mmu.h | 61
-rw-r--r--  arch/arm/include/asm/stage2_pgtable.h | 8
-rw-r--r--  arch/arm/kvm/coproc.c | 4
-rw-r--r--  arch/arm64/include/asm/kvm_arm.h | 6
-rw-r--r--  arch/arm64/include/asm/kvm_asm.h | 7
-rw-r--r--  arch/arm64/include/asm/kvm_emulate.h | 35
-rw-r--r--  arch/arm64/include/asm/kvm_host.h | 5
-rw-r--r--  arch/arm64/include/asm/kvm_mmu.h | 48
-rw-r--r--  arch/arm64/include/asm/pgtable-hwdef.h | 4
-rw-r--r--  arch/arm64/include/asm/pgtable.h | 9
-rw-r--r--  arch/arm64/include/asm/stage2_pgtable.h | 16
-rw-r--r--  arch/arm64/kvm/debug.c | 21
-rw-r--r--  arch/arm64/kvm/handle_exit.c | 14
-rw-r--r--  arch/arm64/kvm/hyp/switch.c | 43
-rw-r--r--  arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c | 12
-rw-r--r--  arch/arm64/kvm/sys_regs.c | 12
-rw-r--r--  arch/arm64/kvm/sys_regs.h | 4
-rw-r--r--  arch/arm64/kvm/trace.h | 35
-rw-r--r--  arch/mips/include/asm/kvm_host.h | 2
-rw-r--r--  arch/mips/kvm/mips.c | 29
-rw-r--r--  arch/mips/kvm/mmu.c | 3
-rw-r--r--  arch/powerpc/include/asm/hvcall.h | 1
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s.h | 23
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s_64.h | 18
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h | 5
-rw-r--r--  arch/powerpc/include/asm/kvm_ppc.h | 10
-rw-r--r--  arch/powerpc/kernel/exceptions-64s.S | 9
-rw-r--r--  arch/powerpc/kvm/book3s.c | 8
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_hv.c | 12
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_radix.c | 160
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c | 95
-rw-r--r--  arch/powerpc/kvm/book3s_hv_nested.c | 190
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_mmu.c | 2
-rw-r--r--  arch/powerpc/kvm/book3s_pr.c | 4
-rw-r--r--  arch/powerpc/kvm/book3s_xics.c | 12
-rw-r--r--  arch/powerpc/kvm/book3s_xive.c | 12
-rw-r--r--  arch/powerpc/kvm/booke.c | 3
-rw-r--r--  arch/powerpc/kvm/e500_mmu_host.c | 3
-rw-r--r--  arch/powerpc/kvm/powerpc.c | 47
-rw-r--r--  arch/powerpc/mm/fault.c | 1
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 35
-rw-r--r--  arch/x86/events/intel/pt.c | 60
-rw-r--r--  arch/x86/events/intel/pt.h | 58
-rw-r--r--  arch/x86/hyperv/nested.c | 80
-rw-r--r--  arch/x86/include/asm/cpufeatures.h | 1
-rw-r--r--  arch/x86/include/asm/hyperv-tlfs.h | 335
-rw-r--r--  arch/x86/include/asm/intel_pt.h | 26
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 25
-rw-r--r--  arch/x86/include/asm/mshyperv.h | 15
-rw-r--r--  arch/x86/include/asm/msr-index.h | 37
-rw-r--r--  arch/x86/include/asm/svm.h | 7
-rw-r--r--  arch/x86/include/asm/trace/hyperv.h | 14
-rw-r--r--  arch/x86/include/asm/vmx.h | 9
-rw-r--r--  arch/x86/kernel/kvmclock.c | 15
-rw-r--r--  arch/x86/kvm/Makefile | 2
-rw-r--r--  arch/x86/kvm/cpuid.c | 31
-rw-r--r--  arch/x86/kvm/hyperv.c | 305
-rw-r--r--  arch/x86/kvm/hyperv.h | 4
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h | 2
-rw-r--r--  arch/x86/kvm/lapic.c | 5
-rw-r--r--  arch/x86/kvm/mmu.c | 98
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 3
-rw-r--r--  arch/x86/kvm/svm.c | 68
-rw-r--r--  arch/x86/kvm/trace.h | 10
-rw-r--r--  arch/x86/kvm/vmx.c | 15252
-rw-r--r--  arch/x86/kvm/vmx/capabilities.h | 343
-rw-r--r--  arch/x86/kvm/vmx/evmcs.c (renamed from arch/x86/kvm/vmx_evmcs.h) | 78
-rw-r--r--  arch/x86/kvm/vmx/evmcs.h | 202
-rw-r--r--  arch/x86/kvm/vmx/nested.c | 5721
-rw-r--r--  arch/x86/kvm/vmx/nested.h | 282
-rw-r--r--  arch/x86/kvm/vmx/ops.h | 285
-rw-r--r--  arch/x86/kvm/vmx/pmu_intel.c (renamed from arch/x86/kvm/pmu_intel.c) | 0
-rw-r--r--  arch/x86/kvm/vmx/vmcs.h | 136
-rw-r--r--  arch/x86/kvm/vmx/vmcs12.c | 157
-rw-r--r--  arch/x86/kvm/vmx/vmcs12.h | 462
-rw-r--r--  arch/x86/kvm/vmx/vmcs_shadow_fields.h (renamed from arch/x86/kvm/vmx_shadow_fields.h) | 0
-rw-r--r--  arch/x86/kvm/vmx/vmenter.S | 57
-rw-r--r--  arch/x86/kvm/vmx/vmx.c | 7935
-rw-r--r--  arch/x86/kvm/vmx/vmx.h | 519
-rw-r--r--  arch/x86/kvm/x86.c | 161
-rw-r--r--  drivers/hv/hv.c | 2
-rw-r--r--  drivers/hv/hyperv_vmbus.h | 68
-rw-r--r--  include/kvm/arm_arch_timer.h | 4
-rw-r--r--  include/linux/compiler_attributes.h | 9
-rw-r--r--  include/linux/kvm_host.h | 12
-rw-r--r--  include/uapi/linux/kvm.h | 19
-rwxr-xr-x  tools/kvm/kvm_stat/kvm_stat | 2
-rw-r--r--  tools/testing/selftests/android/Makefile | 2
-rw-r--r--  tools/testing/selftests/futex/functional/Makefile | 1
-rw-r--r--  tools/testing/selftests/gpio/Makefile | 6
-rw-r--r--  tools/testing/selftests/kvm/Makefile | 5
-rw-r--r--  tools/testing/selftests/kvm/clear_dirty_log_test.c | 2
-rw-r--r--  tools/testing/selftests/kvm/dirty_log_test.c | 165
-rw-r--r--  tools/testing/selftests/kvm/include/kvm_util.h | 8
-rw-r--r--  tools/testing/selftests/kvm/lib/aarch64/processor.c | 18
-rw-r--r--  tools/testing/selftests/kvm/lib/kvm_util.c | 67
-rw-r--r--  tools/testing/selftests/kvm/lib/kvm_util_internal.h | 1
-rw-r--r--  tools/testing/selftests/kvm/lib/ucall.c | 36
-rw-r--r--  tools/testing/selftests/kvm/x86_64/evmcs_test.c | 4
-rw-r--r--  tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c | 157
-rw-r--r--  tools/testing/selftests/kvm/x86_64/state_test.c | 4
-rw-r--r--  tools/testing/selftests/lib.mk | 8
-rw-r--r--  tools/testing/selftests/networking/timestamping/Makefile | 1
-rw-r--r--  tools/testing/selftests/tc-testing/bpf/Makefile | 1
-rw-r--r--  tools/testing/selftests/vm/Makefile | 1
-rw-r--r--  virt/kvm/arm/arch_timer.c | 35
-rw-r--r--  virt/kvm/arm/arm.c | 47
-rw-r--r--  virt/kvm/arm/hyp/vgic-v3-sr.c | 6
-rw-r--r--  virt/kvm/arm/mmio.c | 11
-rw-r--r--  virt/kvm/arm/mmu.c | 390
-rw-r--r--  virt/kvm/arm/trace.h | 18
-rw-r--r--  virt/kvm/arm/vgic/vgic-mmio.c | 44
-rw-r--r--  virt/kvm/arm/vgic/vgic.c | 13
-rw-r--r--  virt/kvm/async_pf.c | 2
-rw-r--r--  virt/kvm/kvm_main.c | 208
119 files changed, 19024 insertions, 16329 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index cd209f7730af..356156f5c52d 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -305,6 +305,9 @@ the address space for which you want to return the dirty bitmap.
 They must be less than the value that KVM_CHECK_EXTENSION returns for
 the KVM_CAP_MULTI_ADDRESS_SPACE capability.
 
+The bits in the dirty bitmap are cleared before the ioctl returns, unless
+KVM_CAP_MANUAL_DIRTY_LOG_PROTECT is enabled. For more information,
+see the description of the capability.
 
 4.9 KVM_SET_MEMORY_ALIAS
 
@@ -1129,10 +1132,15 @@ documentation when it pops into existence).
 
 4.37 KVM_ENABLE_CAP
 
-Capability: KVM_CAP_ENABLE_CAP, KVM_CAP_ENABLE_CAP_VM
-Architectures: x86 (only KVM_CAP_ENABLE_CAP_VM),
-               mips (only KVM_CAP_ENABLE_CAP), ppc, s390
-Type: vcpu ioctl, vm ioctl (with KVM_CAP_ENABLE_CAP_VM)
+Capability: KVM_CAP_ENABLE_CAP
+Architectures: mips, ppc, s390
+Type: vcpu ioctl
+Parameters: struct kvm_enable_cap (in)
+Returns: 0 on success; -1 on error
+
+Capability: KVM_CAP_ENABLE_CAP_VM
+Architectures: all
+Type: vcpu ioctl
 Parameters: struct kvm_enable_cap (in)
 Returns: 0 on success; -1 on error
 
@@ -3753,6 +3761,102 @@ Coalesced pio is based on coalesced mmio. There is little difference
 between coalesced mmio and pio except that coalesced pio records accesses
 to I/O ports.
 
+4.117 KVM_CLEAR_DIRTY_LOG (vm ioctl)
+
+Capability: KVM_CAP_MANUAL_DIRTY_LOG_PROTECT
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_dirty_log (in)
+Returns: 0 on success, -1 on error
+
+/* for KVM_CLEAR_DIRTY_LOG */
+struct kvm_clear_dirty_log {
+	__u32 slot;
+	__u32 num_pages;
+	__u64 first_page;
+	union {
+		void __user *dirty_bitmap; /* one bit per page */
+		__u64 padding;
+	};
+};
+
+The ioctl clears the dirty status of pages in a memory slot, according to
+the bitmap that is passed in struct kvm_clear_dirty_log's dirty_bitmap
+field. Bit 0 of the bitmap corresponds to page "first_page" in the
+memory slot, and num_pages is the size in bits of the input bitmap.
+Both first_page and num_pages must be a multiple of 64. For each bit
+that is set in the input bitmap, the corresponding page is marked "clean"
+in KVM's dirty bitmap, and dirty tracking is re-enabled for that page
+(for example via write-protection, or by clearing the dirty bit in
+a page table entry).
+
+If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 specify
+the address space for which you want to return the dirty bitmap.
+They must be less than the value that KVM_CHECK_EXTENSION returns for
+the KVM_CAP_MULTI_ADDRESS_SPACE capability.
+
+This ioctl is mostly useful when KVM_CAP_MANUAL_DIRTY_LOG_PROTECT
+is enabled; for more information, see the description of the capability.
+However, it can always be used as long as KVM_CHECK_EXTENSION confirms
+that KVM_CAP_MANUAL_DIRTY_LOG_PROTECT is present.
+
+4.118 KVM_GET_SUPPORTED_HV_CPUID
+
+Capability: KVM_CAP_HYPERV_CPUID
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_cpuid2 (in/out)
+Returns: 0 on success, -1 on error
+
+struct kvm_cpuid2 {
+	__u32 nent;
+	__u32 padding;
+	struct kvm_cpuid_entry2 entries[0];
+};
+
+struct kvm_cpuid_entry2 {
+	__u32 function;
+	__u32 index;
+	__u32 flags;
+	__u32 eax;
+	__u32 ebx;
+	__u32 ecx;
+	__u32 edx;
+	__u32 padding[3];
+};
+
+This ioctl returns x86 cpuid feature leaves related to Hyper-V emulation in
+KVM. Userspace can use the information returned by this ioctl to construct
+cpuid information presented to guests consuming Hyper-V enlightenments (e.g.
+Windows or Hyper-V guests).
+
+CPUID feature leaves returned by this ioctl are defined by the Hyper-V Top Level
+Functional Specification (TLFS). These leaves can't be obtained with the
+KVM_GET_SUPPORTED_CPUID ioctl because some of them intersect with KVM feature
+leaves (0x40000000, 0x40000001).
+
+Currently, the following list of CPUID leaves is returned:
+ HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS
+ HYPERV_CPUID_INTERFACE
+ HYPERV_CPUID_VERSION
+ HYPERV_CPUID_FEATURES
+ HYPERV_CPUID_ENLIGHTMENT_INFO
+ HYPERV_CPUID_IMPLEMENT_LIMITS
+ HYPERV_CPUID_NESTED_FEATURES
+
+The HYPERV_CPUID_NESTED_FEATURES leaf is only exposed when Enlightened VMCS was
+enabled on the corresponding vCPU (KVM_CAP_HYPERV_ENLIGHTENED_VMCS).
+
+Userspace invokes KVM_GET_SUPPORTED_HV_CPUID by passing a kvm_cpuid2 structure
+with the 'nent' field indicating the number of entries in the variable-size
+array 'entries'. If the number of entries is too low to describe all Hyper-V
+feature leaves, an error (E2BIG) is returned. If the number is greater than
+or equal to the number of Hyper-V feature leaves, the 'nent' field is adjusted
+to the number of valid entries in the 'entries' array, which is then filled.
+
+The 'index' and 'flags' fields in 'struct kvm_cpuid_entry2' are currently reserved;
+userspace should not expect to get any particular value there.
+
 5. The kvm_run structure
 ------------------------
 
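The nent/E2BIG contract documented in 4.118 can be exercised with a retry loop. The
following userspace sketch is illustrative only (it is not part of this series) and
assumes an existing vcpu file descriptor plus the usual <linux/kvm.h>, <sys/ioctl.h>,
<errno.h>, <err.h> and <stdlib.h> definitions:

	/* Illustrative: query the Hyper-V CPUID leaves for one vcpu. */
	static struct kvm_cpuid2 *get_hv_cpuid(int vcpu_fd)
	{
		int nent = 8;			/* arbitrary starting size */

		for (;;) {
			struct kvm_cpuid2 *cpuid;

			cpuid = calloc(1, sizeof(*cpuid) +
					  nent * sizeof(struct kvm_cpuid_entry2));
			if (!cpuid)
				err(1, "calloc");
			cpuid->nent = nent;
			if (ioctl(vcpu_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid) == 0)
				return cpuid;	/* nent now holds the valid count */
			if (errno != E2BIG)
				err(1, "KVM_GET_SUPPORTED_HV_CPUID");
			free(cpuid);		/* buffer too small: grow and retry */
			nent *= 2;
		}
	}

Each returned entry's 'function' field then identifies one of the leaves listed above.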
@@ -4647,6 +4751,30 @@ and injected exceptions.
 * For the new DR6 bits, note that bit 16 is set iff the #DB exception
   will clear DR6.RTM.
 
+7.18 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT
+
+Architectures: all
+Parameters: args[0] whether feature should be enabled or not
+
+With this capability enabled, KVM_GET_DIRTY_LOG will not automatically
+clear and write-protect all pages that are returned as dirty.
+Rather, userspace will have to do this operation separately using
+KVM_CLEAR_DIRTY_LOG.
+
+At the cost of a slightly more complicated operation, this provides better
+scalability and responsiveness for two reasons. First, the
+KVM_CLEAR_DIRTY_LOG ioctl can operate on a 64-page granularity rather
+than requiring a full memslot to be synced; this ensures that KVM does not
+take spinlocks for an extended period of time. Second, in some cases a
+large amount of time can pass between a call to KVM_GET_DIRTY_LOG and
+userspace actually using the data in the page. Pages can be modified
+during this time, which is inefficient for both the guest and userspace:
+the guest will incur a higher penalty due to write protection faults,
+while userspace can see false reports of dirty pages. Manual reprotection
+helps reduce this time, improving guest performance and reducing the
+number of dirty log false positives.
+
+
 8. Other capabilities.
 ----------------------
 
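Putting 7.18 and 4.117 together, a minimal userspace sketch of the intended flow might
look like the fragment below (illustrative only, not part of this series; vm_fd, slot
and bitmap are assumed to already exist, with bitmap sized to cover the memslot and
KVM_CHECK_EXTENSION having reported KVM_CAP_MANUAL_DIRTY_LOG_PROTECT):

	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT,
		.args[0] = 1,				/* enable manual reprotection */
	};
	struct kvm_dirty_log log = {
		.slot = slot,
		.dirty_bitmap = bitmap,			/* one bit per page */
	};
	struct kvm_clear_dirty_log clear = {
		.slot = slot,
		.first_page = 0,
		.num_pages = 64,			/* first_page/num_pages: multiples of 64 */
		.dirty_bitmap = bitmap,
	};

	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);		/* vm ioctl */
	ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);		/* fetch bitmap, no write-protect */
	/* ... copy out the pages reported dirty in the first 64-page window ... */
	ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear);	/* re-arm dirty tracking for them */

Error checking is omitted; each ioctl returns 0 on success and -1 on error as described
above. Passing back the bitmap returned by KVM_GET_DIRTY_LOG reprotects exactly the
pages that were reported dirty.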
diff --git a/MAINTAINERS b/MAINTAINERS
index 80b377dda900..c4665d49dc50 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8309,6 +8309,7 @@ W: http://www.linux-kvm.org
 T:	git git://git.kernel.org/pub/scm/virt/kvm/kvm.git
 S:	Supported
 F:	arch/x86/kvm/
+F:	arch/x86/kvm/*/
 F:	arch/x86/include/uapi/asm/kvm*
 F:	arch/x86/include/asm/kvm*
 F:	arch/x86/include/asm/pvclock-abi.h
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
index 231e87ad45d5..35491af87985 100644
--- a/arch/arm/include/asm/kvm_asm.h
+++ b/arch/arm/include/asm/kvm_asm.h
@@ -23,6 +23,10 @@
23 23
24#define ARM_EXIT_WITH_ABORT_BIT 31 24#define ARM_EXIT_WITH_ABORT_BIT 31
25#define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_ABORT_BIT)) 25#define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_ABORT_BIT))
26#define ARM_EXCEPTION_IS_TRAP(x) \
27 (ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_PREF_ABORT || \
28 ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_DATA_ABORT || \
29 ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_HVC)
26#define ARM_ABORT_PENDING(x) !!((x) & (1U << ARM_EXIT_WITH_ABORT_BIT)) 30#define ARM_ABORT_PENDING(x) !!((x) & (1U << ARM_EXIT_WITH_ABORT_BIT))
27 31
28#define ARM_EXCEPTION_RESET 0 32#define ARM_EXCEPTION_RESET 0
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 2184d9ddb418..ca56537b61bc 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -225,7 +225,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
225#define KVM_ARCH_WANT_MMU_NOTIFIER 225#define KVM_ARCH_WANT_MMU_NOTIFIER
226int kvm_unmap_hva_range(struct kvm *kvm, 226int kvm_unmap_hva_range(struct kvm *kvm,
227 unsigned long start, unsigned long end); 227 unsigned long start, unsigned long end);
228void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); 228int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
229 229
230unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu); 230unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu);
231int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); 231int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices);
@@ -296,11 +296,6 @@ static inline void kvm_arm_init_debug(void) {}
296static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {} 296static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {}
297static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {} 297static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {}
298static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {} 298static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {}
299static inline bool kvm_arm_handle_step_debug(struct kvm_vcpu *vcpu,
300 struct kvm_run *run)
301{
302 return false;
303}
304 299
305int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, 300int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
306 struct kvm_device_attr *attr); 301 struct kvm_device_attr *attr);
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 1098ffc3d54b..3a875fc1b63c 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -82,6 +82,67 @@ void kvm_clear_hyp_idmap(void);
82#define kvm_mk_pud(pmdp) __pud(__pa(pmdp) | PMD_TYPE_TABLE) 82#define kvm_mk_pud(pmdp) __pud(__pa(pmdp) | PMD_TYPE_TABLE)
83#define kvm_mk_pgd(pudp) ({ BUILD_BUG(); 0; }) 83#define kvm_mk_pgd(pudp) ({ BUILD_BUG(); 0; })
84 84
85#define kvm_pfn_pte(pfn, prot) pfn_pte(pfn, prot)
86#define kvm_pfn_pmd(pfn, prot) pfn_pmd(pfn, prot)
87#define kvm_pfn_pud(pfn, prot) (__pud(0))
88
89#define kvm_pud_pfn(pud) ({ WARN_ON(1); 0; })
90
91
92#define kvm_pmd_mkhuge(pmd) pmd_mkhuge(pmd)
93/* No support for pud hugepages */
94#define kvm_pud_mkhuge(pud) ( {WARN_ON(1); pud; })
95
96/*
97 * The following kvm_*pud*() functions are provided strictly to allow
98 * sharing code with arm64. They should never be called in practice.
99 */
100static inline void kvm_set_s2pud_readonly(pud_t *pud)
101{
102 WARN_ON(1);
103}
104
105static inline bool kvm_s2pud_readonly(pud_t *pud)
106{
107 WARN_ON(1);
108 return false;
109}
110
111static inline void kvm_set_pud(pud_t *pud, pud_t new_pud)
112{
113 WARN_ON(1);
114}
115
116static inline pud_t kvm_s2pud_mkwrite(pud_t pud)
117{
118 WARN_ON(1);
119 return pud;
120}
121
122static inline pud_t kvm_s2pud_mkexec(pud_t pud)
123{
124 WARN_ON(1);
125 return pud;
126}
127
128static inline bool kvm_s2pud_exec(pud_t *pud)
129{
130 WARN_ON(1);
131 return false;
132}
133
134static inline pud_t kvm_s2pud_mkyoung(pud_t pud)
135{
136 BUG();
137 return pud;
138}
139
140static inline bool kvm_s2pud_young(pud_t pud)
141{
142 WARN_ON(1);
143 return false;
144}
145
85static inline pte_t kvm_s2pte_mkwrite(pte_t pte) 146static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
86{ 147{
87 pte_val(pte) |= L_PTE_S2_RDWR; 148 pte_val(pte) |= L_PTE_S2_RDWR;
diff --git a/arch/arm/include/asm/stage2_pgtable.h b/arch/arm/include/asm/stage2_pgtable.h
index f6a7ea805232..c4b1d4fb1797 100644
--- a/arch/arm/include/asm/stage2_pgtable.h
+++ b/arch/arm/include/asm/stage2_pgtable.h
@@ -68,4 +68,12 @@ stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
68#define stage2_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp) 68#define stage2_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp)
69#define stage2_pud_table_empty(kvm, pudp) false 69#define stage2_pud_table_empty(kvm, pudp) false
70 70
71static inline bool kvm_stage2_has_pud(struct kvm *kvm)
72{
73 return false;
74}
75
76#define S2_PMD_MASK PMD_MASK
77#define S2_PMD_SIZE PMD_SIZE
78
71#endif /* __ARM_S2_PGTABLE_H_ */ 79#endif /* __ARM_S2_PGTABLE_H_ */
diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c
index cb094e55dc5f..222c1635bc7a 100644
--- a/arch/arm/kvm/coproc.c
+++ b/arch/arm/kvm/coproc.c
@@ -602,8 +602,8 @@ static int emulate_cp15(struct kvm_vcpu *vcpu,
602 } 602 }
603 } else { 603 } else {
604 /* If access function fails, it should complain. */ 604 /* If access function fails, it should complain. */
605 kvm_err("Unsupported guest CP15 access at: %08lx\n", 605 kvm_err("Unsupported guest CP15 access at: %08lx [%08lx]\n",
606 *vcpu_pc(vcpu)); 606 *vcpu_pc(vcpu), *vcpu_cpsr(vcpu));
607 print_cp_instr(params); 607 print_cp_instr(params);
608 kvm_inject_undefined(vcpu); 608 kvm_inject_undefined(vcpu);
609 } 609 }
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index f9123fe8fcf3..7f9d2bfcf82e 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -107,7 +107,7 @@
107 TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK | TCR_EL2_T0SZ_MASK) 107 TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK | TCR_EL2_T0SZ_MASK)
108 108
109/* VTCR_EL2 Registers bits */ 109/* VTCR_EL2 Registers bits */
110#define VTCR_EL2_RES1 (1 << 31) 110#define VTCR_EL2_RES1 (1U << 31)
111#define VTCR_EL2_HD (1 << 22) 111#define VTCR_EL2_HD (1 << 22)
112#define VTCR_EL2_HA (1 << 21) 112#define VTCR_EL2_HA (1 << 21)
113#define VTCR_EL2_PS_SHIFT TCR_EL2_PS_SHIFT 113#define VTCR_EL2_PS_SHIFT TCR_EL2_PS_SHIFT
@@ -323,10 +323,6 @@
323#define PAR_TO_HPFAR(par) \ 323#define PAR_TO_HPFAR(par) \
324 (((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8) 324 (((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8)
325 325
326#define kvm_arm_exception_type \
327 {0, "IRQ" }, \
328 {1, "TRAP" }
329
330#define ECN(x) { ESR_ELx_EC_##x, #x } 326#define ECN(x) { ESR_ELx_EC_##x, #x }
331 327
332#define kvm_arm_exception_class \ 328#define kvm_arm_exception_class \
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index aea01a09eb94..f5b79e995f40 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -25,6 +25,7 @@
25 25
26#define ARM_EXIT_WITH_SERROR_BIT 31 26#define ARM_EXIT_WITH_SERROR_BIT 31
27#define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_SERROR_BIT)) 27#define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_SERROR_BIT))
28#define ARM_EXCEPTION_IS_TRAP(x) (ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_TRAP)
28#define ARM_SERROR_PENDING(x) !!((x) & (1U << ARM_EXIT_WITH_SERROR_BIT)) 29#define ARM_SERROR_PENDING(x) !!((x) & (1U << ARM_EXIT_WITH_SERROR_BIT))
29 30
30#define ARM_EXCEPTION_IRQ 0 31#define ARM_EXCEPTION_IRQ 0
@@ -34,6 +35,12 @@
34/* The hyp-stub will return this for any kvm_call_hyp() call */ 35/* The hyp-stub will return this for any kvm_call_hyp() call */
35#define ARM_EXCEPTION_HYP_GONE HVC_STUB_ERR 36#define ARM_EXCEPTION_HYP_GONE HVC_STUB_ERR
36 37
38#define kvm_arm_exception_type \
39 {ARM_EXCEPTION_IRQ, "IRQ" }, \
40 {ARM_EXCEPTION_EL1_SERROR, "SERROR" }, \
41 {ARM_EXCEPTION_TRAP, "TRAP" }, \
42 {ARM_EXCEPTION_HYP_GONE, "HYP_GONE" }
43
37#ifndef __ASSEMBLY__ 44#ifndef __ASSEMBLY__
38 45
39#include <linux/mm.h> 46#include <linux/mm.h>
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 21247870def7..506386a3edde 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -24,6 +24,7 @@
24 24
25#include <linux/kvm_host.h> 25#include <linux/kvm_host.h>
26 26
27#include <asm/debug-monitors.h>
27#include <asm/esr.h> 28#include <asm/esr.h>
28#include <asm/kvm_arm.h> 29#include <asm/kvm_arm.h>
29#include <asm/kvm_hyp.h> 30#include <asm/kvm_hyp.h>
@@ -147,14 +148,6 @@ static inline bool kvm_condition_valid(const struct kvm_vcpu *vcpu)
147 return true; 148 return true;
148} 149}
149 150
150static inline void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr)
151{
152 if (vcpu_mode_is_32bit(vcpu))
153 kvm_skip_instr32(vcpu, is_wide_instr);
154 else
155 *vcpu_pc(vcpu) += 4;
156}
157
158static inline void vcpu_set_thumb(struct kvm_vcpu *vcpu) 151static inline void vcpu_set_thumb(struct kvm_vcpu *vcpu)
159{ 152{
160 *vcpu_cpsr(vcpu) |= PSR_AA32_T_BIT; 153 *vcpu_cpsr(vcpu) |= PSR_AA32_T_BIT;
@@ -424,4 +417,30 @@ static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu,
424 return data; /* Leave LE untouched */ 417 return data; /* Leave LE untouched */
425} 418}
426 419
420static inline void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr)
421{
422 if (vcpu_mode_is_32bit(vcpu))
423 kvm_skip_instr32(vcpu, is_wide_instr);
424 else
425 *vcpu_pc(vcpu) += 4;
426
427 /* advance the singlestep state machine */
428 *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS;
429}
430
431/*
432 * Skip an instruction which has been emulated at hyp while most guest sysregs
433 * are live.
434 */
435static inline void __hyp_text __kvm_skip_instr(struct kvm_vcpu *vcpu)
436{
437 *vcpu_pc(vcpu) = read_sysreg_el2(elr);
438 vcpu->arch.ctxt.gp_regs.regs.pstate = read_sysreg_el2(spsr);
439
440 kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
441
442 write_sysreg_el2(vcpu->arch.ctxt.gp_regs.regs.pstate, spsr);
443 write_sysreg_el2(*vcpu_pc(vcpu), elr);
444}
445
427#endif /* __ARM64_KVM_EMULATE_H__ */ 446#endif /* __ARM64_KVM_EMULATE_H__ */
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 9217759afa6b..7732d0ba4e60 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -319,7 +319,7 @@ struct kvm_vcpu_arch {
319 */ 319 */
320#define __vcpu_sys_reg(v,r) ((v)->arch.ctxt.sys_regs[(r)]) 320#define __vcpu_sys_reg(v,r) ((v)->arch.ctxt.sys_regs[(r)])
321 321
322u64 vcpu_read_sys_reg(struct kvm_vcpu *vcpu, int reg); 322u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg);
323void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg); 323void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg);
324 324
325/* 325/*
@@ -360,7 +360,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
360#define KVM_ARCH_WANT_MMU_NOTIFIER 360#define KVM_ARCH_WANT_MMU_NOTIFIER
361int kvm_unmap_hva_range(struct kvm *kvm, 361int kvm_unmap_hva_range(struct kvm *kvm,
362 unsigned long start, unsigned long end); 362 unsigned long start, unsigned long end);
363void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); 363int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
364int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); 364int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
365int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); 365int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
366 366
@@ -449,7 +449,6 @@ void kvm_arm_init_debug(void);
449void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); 449void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
450void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); 450void kvm_arm_clear_debug(struct kvm_vcpu *vcpu);
451void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu); 451void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu);
452bool kvm_arm_handle_step_debug(struct kvm_vcpu *vcpu, struct kvm_run *run);
453int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, 452int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
454 struct kvm_device_attr *attr); 453 struct kvm_device_attr *attr);
455int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, 454int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 658657367f2f..8af4b1befa42 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -184,6 +184,17 @@ void kvm_clear_hyp_idmap(void);
184#define kvm_mk_pgd(pudp) \ 184#define kvm_mk_pgd(pudp) \
185 __pgd(__phys_to_pgd_val(__pa(pudp)) | PUD_TYPE_TABLE) 185 __pgd(__phys_to_pgd_val(__pa(pudp)) | PUD_TYPE_TABLE)
186 186
187#define kvm_set_pud(pudp, pud) set_pud(pudp, pud)
188
189#define kvm_pfn_pte(pfn, prot) pfn_pte(pfn, prot)
190#define kvm_pfn_pmd(pfn, prot) pfn_pmd(pfn, prot)
191#define kvm_pfn_pud(pfn, prot) pfn_pud(pfn, prot)
192
193#define kvm_pud_pfn(pud) pud_pfn(pud)
194
195#define kvm_pmd_mkhuge(pmd) pmd_mkhuge(pmd)
196#define kvm_pud_mkhuge(pud) pud_mkhuge(pud)
197
187static inline pte_t kvm_s2pte_mkwrite(pte_t pte) 198static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
188{ 199{
189 pte_val(pte) |= PTE_S2_RDWR; 200 pte_val(pte) |= PTE_S2_RDWR;
@@ -196,6 +207,12 @@ static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd)
196 return pmd; 207 return pmd;
197} 208}
198 209
210static inline pud_t kvm_s2pud_mkwrite(pud_t pud)
211{
212 pud_val(pud) |= PUD_S2_RDWR;
213 return pud;
214}
215
199static inline pte_t kvm_s2pte_mkexec(pte_t pte) 216static inline pte_t kvm_s2pte_mkexec(pte_t pte)
200{ 217{
201 pte_val(pte) &= ~PTE_S2_XN; 218 pte_val(pte) &= ~PTE_S2_XN;
@@ -208,6 +225,12 @@ static inline pmd_t kvm_s2pmd_mkexec(pmd_t pmd)
208 return pmd; 225 return pmd;
209} 226}
210 227
228static inline pud_t kvm_s2pud_mkexec(pud_t pud)
229{
230 pud_val(pud) &= ~PUD_S2_XN;
231 return pud;
232}
233
211static inline void kvm_set_s2pte_readonly(pte_t *ptep) 234static inline void kvm_set_s2pte_readonly(pte_t *ptep)
212{ 235{
213 pteval_t old_pteval, pteval; 236 pteval_t old_pteval, pteval;
@@ -246,6 +269,31 @@ static inline bool kvm_s2pmd_exec(pmd_t *pmdp)
246 return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN); 269 return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN);
247} 270}
248 271
272static inline void kvm_set_s2pud_readonly(pud_t *pudp)
273{
274 kvm_set_s2pte_readonly((pte_t *)pudp);
275}
276
277static inline bool kvm_s2pud_readonly(pud_t *pudp)
278{
279 return kvm_s2pte_readonly((pte_t *)pudp);
280}
281
282static inline bool kvm_s2pud_exec(pud_t *pudp)
283{
284 return !(READ_ONCE(pud_val(*pudp)) & PUD_S2_XN);
285}
286
287static inline pud_t kvm_s2pud_mkyoung(pud_t pud)
288{
289 return pud_mkyoung(pud);
290}
291
292static inline bool kvm_s2pud_young(pud_t pud)
293{
294 return pud_young(pud);
295}
296
249#define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) 297#define hyp_pte_table_empty(ptep) kvm_page_empty(ptep)
250 298
251#ifdef __PAGETABLE_PMD_FOLDED 299#ifdef __PAGETABLE_PMD_FOLDED
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index 54a37660b8c9..22bb3ae514f5 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -193,6 +193,10 @@
193#define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */ 193#define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */
194#define PMD_S2_XN (_AT(pmdval_t, 2) << 53) /* XN[1:0] */ 194#define PMD_S2_XN (_AT(pmdval_t, 2) << 53) /* XN[1:0] */
195 195
196#define PUD_S2_RDONLY (_AT(pudval_t, 1) << 6) /* HAP[2:1] */
197#define PUD_S2_RDWR (_AT(pudval_t, 3) << 6) /* HAP[2:1] */
198#define PUD_S2_XN (_AT(pudval_t, 2) << 53) /* XN[1:0] */
199
196/* 200/*
197 * Memory Attribute override for Stage-2 (MemAttr[3:0]) 201 * Memory Attribute override for Stage-2 (MemAttr[3:0])
198 */ 202 */
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 5bbb59c81920..de70c1eabf33 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -315,6 +315,11 @@ static inline pte_t pud_pte(pud_t pud)
315 return __pte(pud_val(pud)); 315 return __pte(pud_val(pud));
316} 316}
317 317
318static inline pud_t pte_pud(pte_t pte)
319{
320 return __pud(pte_val(pte));
321}
322
318static inline pmd_t pud_pmd(pud_t pud) 323static inline pmd_t pud_pmd(pud_t pud)
319{ 324{
320 return __pmd(pud_val(pud)); 325 return __pmd(pud_val(pud));
@@ -382,8 +387,12 @@ static inline int pmd_protnone(pmd_t pmd)
382#define pfn_pmd(pfn,prot) __pmd(__phys_to_pmd_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) 387#define pfn_pmd(pfn,prot) __pmd(__phys_to_pmd_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))
383#define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) 388#define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot)
384 389
390#define pud_young(pud) pte_young(pud_pte(pud))
391#define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud)))
385#define pud_write(pud) pte_write(pud_pte(pud)) 392#define pud_write(pud) pte_write(pud_pte(pud))
386 393
394#define pud_mkhuge(pud) (__pud(pud_val(pud) & ~PUD_TABLE_BIT))
395
387#define __pud_to_phys(pud) __pte_to_phys(pud_pte(pud)) 396#define __pud_to_phys(pud) __pte_to_phys(pud_pte(pud))
388#define __phys_to_pud_val(phys) __phys_to_pte_val(phys) 397#define __phys_to_pud_val(phys) __phys_to_pte_val(phys)
389#define pud_pfn(pud) ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT) 398#define pud_pfn(pud) ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT)
diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h
index d352f6df8d2c..5412fa40825e 100644
--- a/arch/arm64/include/asm/stage2_pgtable.h
+++ b/arch/arm64/include/asm/stage2_pgtable.h
@@ -30,16 +30,14 @@
30#define pt_levels_pgdir_shift(lvls) ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - (lvls)) 30#define pt_levels_pgdir_shift(lvls) ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - (lvls))
31 31
32/* 32/*
33 * The hardware supports concatenation of up to 16 tables at stage2 entry level 33 * The hardware supports concatenation of up to 16 tables at stage2 entry
34 * and we use the feature whenever possible. 34 * level and we use the feature whenever possible, which means we resolve 4
35 * additional bits of address at the entry level.
35 * 36 *
36 * Now, the minimum number of bits resolved at any level is (PAGE_SHIFT - 3). 37 * This implies, the total number of page table levels required for
37 * On arm64, the smallest PAGE_SIZE supported is 4k, which means 38 * IPA_SHIFT at stage2 expected by the hardware can be calculated using
38 * (PAGE_SHIFT - 3) > 4 holds for all page sizes. 39 * the same logic used for the (non-collapsable) stage1 page tables but for
39 * This implies, the total number of page table levels at stage2 expected 40 * (IPA_SHIFT - 4).
40 * by the hardware is actually the number of levels required for (IPA_SHIFT - 4)
41 * in normal translations(e.g, stage1), since we cannot have another level in
42 * the range (IPA_SHIFT, IPA_SHIFT - 4).
43 */ 41 */
44#define stage2_pgtable_levels(ipa) ARM64_HW_PGTABLE_LEVELS((ipa) - 4) 42#define stage2_pgtable_levels(ipa) ARM64_HW_PGTABLE_LEVELS((ipa) - 4)
45#define kvm_stage2_levels(kvm) VTCR_EL2_LVLS(kvm->arch.vtcr) 43#define kvm_stage2_levels(kvm) VTCR_EL2_LVLS(kvm->arch.vtcr)
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index 00d422336a45..f39801e4136c 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
@@ -236,24 +236,3 @@ void kvm_arm_clear_debug(struct kvm_vcpu *vcpu)
236 } 236 }
237 } 237 }
238} 238}
239
240
241/*
242 * After successfully emulating an instruction, we might want to
243 * return to user space with a KVM_EXIT_DEBUG. We can only do this
244 * once the emulation is complete, though, so for userspace emulations
245 * we have to wait until we have re-entered KVM before calling this
246 * helper.
247 *
248 * Return true (and set exit_reason) to return to userspace or false
249 * if no further action is required.
250 */
251bool kvm_arm_handle_step_debug(struct kvm_vcpu *vcpu, struct kvm_run *run)
252{
253 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
254 run->exit_reason = KVM_EXIT_DEBUG;
255 run->debug.arch.hsr = ESR_ELx_EC_SOFTSTP_LOW << ESR_ELx_EC_SHIFT;
256 return true;
257 }
258 return false;
259}
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index ab35929dcb3c..0b7983442071 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -247,13 +247,6 @@ static int handle_trap_exceptions(struct kvm_vcpu *vcpu, struct kvm_run *run)
247 handled = exit_handler(vcpu, run); 247 handled = exit_handler(vcpu, run);
248 } 248 }
249 249
250 /*
251 * kvm_arm_handle_step_debug() sets the exit_reason on the kvm_run
252 * structure if we need to return to userspace.
253 */
254 if (handled > 0 && kvm_arm_handle_step_debug(vcpu, run))
255 handled = 0;
256
257 return handled; 250 return handled;
258} 251}
259 252
@@ -287,12 +280,7 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
287 case ARM_EXCEPTION_IRQ: 280 case ARM_EXCEPTION_IRQ:
288 return 1; 281 return 1;
289 case ARM_EXCEPTION_EL1_SERROR: 282 case ARM_EXCEPTION_EL1_SERROR:
290 /* We may still need to return for single-step */ 283 return 1;
291 if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS)
292 && kvm_arm_handle_step_debug(vcpu, run))
293 return 0;
294 else
295 return 1;
296 case ARM_EXCEPTION_TRAP: 284 case ARM_EXCEPTION_TRAP:
297 return handle_trap_exceptions(vcpu, run); 285 return handle_trap_exceptions(vcpu, run);
298 case ARM_EXCEPTION_HYP_GONE: 286 case ARM_EXCEPTION_HYP_GONE:
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index 63ac10ead3a8..b0b1478094b4 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -313,33 +313,6 @@ static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu)
313 return true; 313 return true;
314} 314}
315 315
316/* Skip an instruction which has been emulated. Returns true if
317 * execution can continue or false if we need to exit hyp mode because
318 * single-step was in effect.
319 */
320static bool __hyp_text __skip_instr(struct kvm_vcpu *vcpu)
321{
322 *vcpu_pc(vcpu) = read_sysreg_el2(elr);
323
324 if (vcpu_mode_is_32bit(vcpu)) {
325 vcpu->arch.ctxt.gp_regs.regs.pstate = read_sysreg_el2(spsr);
326 kvm_skip_instr32(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
327 write_sysreg_el2(vcpu->arch.ctxt.gp_regs.regs.pstate, spsr);
328 } else {
329 *vcpu_pc(vcpu) += 4;
330 }
331
332 write_sysreg_el2(*vcpu_pc(vcpu), elr);
333
334 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
335 vcpu->arch.fault.esr_el2 =
336 (ESR_ELx_EC_SOFTSTP_LOW << ESR_ELx_EC_SHIFT) | 0x22;
337 return false;
338 } else {
339 return true;
340 }
341}
342
343static bool __hyp_text __hyp_switch_fpsimd(struct kvm_vcpu *vcpu) 316static bool __hyp_text __hyp_switch_fpsimd(struct kvm_vcpu *vcpu)
344{ 317{
345 struct user_fpsimd_state *host_fpsimd = vcpu->arch.host_fpsimd_state; 318 struct user_fpsimd_state *host_fpsimd = vcpu->arch.host_fpsimd_state;
@@ -428,20 +401,12 @@ static bool __hyp_text fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
428 if (valid) { 401 if (valid) {
429 int ret = __vgic_v2_perform_cpuif_access(vcpu); 402 int ret = __vgic_v2_perform_cpuif_access(vcpu);
430 403
431 if (ret == 1 && __skip_instr(vcpu)) 404 if (ret == 1)
432 return true; 405 return true;
433 406
434 if (ret == -1) { 407 /* Promote an illegal access to an SError.*/
435 /* Promote an illegal access to an 408 if (ret == -1)
436 * SError. If we would be returning
437 * due to single-step clear the SS
438 * bit so handle_exit knows what to
439 * do after dealing with the error.
440 */
441 if (!__skip_instr(vcpu))
442 *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS;
443 *exit_code = ARM_EXCEPTION_EL1_SERROR; 409 *exit_code = ARM_EXCEPTION_EL1_SERROR;
444 }
445 410
446 goto exit; 411 goto exit;
447 } 412 }
@@ -452,7 +417,7 @@ static bool __hyp_text fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
452 kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_CP15_32)) { 417 kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_CP15_32)) {
453 int ret = __vgic_v3_perform_cpuif_access(vcpu); 418 int ret = __vgic_v3_perform_cpuif_access(vcpu);
454 419
455 if (ret == 1 && __skip_instr(vcpu)) 420 if (ret == 1)
456 return true; 421 return true;
457 } 422 }
458 423
diff --git a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
index 215c7c0eb3b0..9cbdd034a563 100644
--- a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
+++ b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
@@ -41,7 +41,7 @@ static bool __hyp_text __is_be(struct kvm_vcpu *vcpu)
41 * Returns: 41 * Returns:
42 * 1: GICV access successfully performed 42 * 1: GICV access successfully performed
43 * 0: Not a GICV access 43 * 0: Not a GICV access
44 * -1: Illegal GICV access 44 * -1: Illegal GICV access successfully performed
45 */ 45 */
46int __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) 46int __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu)
47{ 47{
@@ -61,12 +61,16 @@ int __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu)
61 return 0; 61 return 0;
62 62
63 /* Reject anything but a 32bit access */ 63 /* Reject anything but a 32bit access */
64 if (kvm_vcpu_dabt_get_as(vcpu) != sizeof(u32)) 64 if (kvm_vcpu_dabt_get_as(vcpu) != sizeof(u32)) {
65 __kvm_skip_instr(vcpu);
65 return -1; 66 return -1;
67 }
66 68
67 /* Not aligned? Don't bother */ 69 /* Not aligned? Don't bother */
68 if (fault_ipa & 3) 70 if (fault_ipa & 3) {
71 __kvm_skip_instr(vcpu);
69 return -1; 72 return -1;
73 }
70 74
71 rd = kvm_vcpu_dabt_get_rd(vcpu); 75 rd = kvm_vcpu_dabt_get_rd(vcpu);
72 addr = hyp_symbol_addr(kvm_vgic_global_state)->vcpu_hyp_va; 76 addr = hyp_symbol_addr(kvm_vgic_global_state)->vcpu_hyp_va;
@@ -88,5 +92,7 @@ int __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu)
88 vcpu_set_reg(vcpu, rd, data); 92 vcpu_set_reg(vcpu, rd, data);
89 } 93 }
90 94
95 __kvm_skip_instr(vcpu);
96
91 return 1; 97 return 1;
92} 98}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 1ca592d38c3c..e3e37228ae4e 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -76,7 +76,7 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu,
76 return false; 76 return false;
77} 77}
78 78
79u64 vcpu_read_sys_reg(struct kvm_vcpu *vcpu, int reg) 79u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
80{ 80{
81 if (!vcpu->arch.sysregs_loaded_on_cpu) 81 if (!vcpu->arch.sysregs_loaded_on_cpu)
82 goto immediate_read; 82 goto immediate_read;
@@ -1858,6 +1858,8 @@ static void perform_access(struct kvm_vcpu *vcpu,
1858 struct sys_reg_params *params, 1858 struct sys_reg_params *params,
1859 const struct sys_reg_desc *r) 1859 const struct sys_reg_desc *r)
1860{ 1860{
1861 trace_kvm_sys_access(*vcpu_pc(vcpu), params, r);
1862
1861 /* 1863 /*
1862 * Not having an accessor means that we have configured a trap 1864 * Not having an accessor means that we have configured a trap
1863 * that we don't know how to handle. This certainly qualifies 1865 * that we don't know how to handle. This certainly qualifies
@@ -1920,8 +1922,8 @@ static void unhandled_cp_access(struct kvm_vcpu *vcpu,
1920 WARN_ON(1); 1922 WARN_ON(1);
1921 } 1923 }
1922 1924
1923 kvm_err("Unsupported guest CP%d access at: %08lx\n", 1925 kvm_err("Unsupported guest CP%d access at: %08lx [%08lx]\n",
1924 cp, *vcpu_pc(vcpu)); 1926 cp, *vcpu_pc(vcpu), *vcpu_cpsr(vcpu));
1925 print_sys_reg_instr(params); 1927 print_sys_reg_instr(params);
1926 kvm_inject_undefined(vcpu); 1928 kvm_inject_undefined(vcpu);
1927} 1929}
@@ -2071,8 +2073,8 @@ static int emulate_sys_reg(struct kvm_vcpu *vcpu,
2071 if (likely(r)) { 2073 if (likely(r)) {
2072 perform_access(vcpu, params, r); 2074 perform_access(vcpu, params, r);
2073 } else { 2075 } else {
2074 kvm_err("Unsupported guest sys_reg access at: %lx\n", 2076 kvm_err("Unsupported guest sys_reg access at: %lx [%08lx]\n",
2075 *vcpu_pc(vcpu)); 2077 *vcpu_pc(vcpu), *vcpu_cpsr(vcpu));
2076 print_sys_reg_instr(params); 2078 print_sys_reg_instr(params);
2077 kvm_inject_undefined(vcpu); 2079 kvm_inject_undefined(vcpu);
2078 } 2080 }
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
index cd710f8b63e0..3b1bc7f01d0b 100644
--- a/arch/arm64/kvm/sys_regs.h
+++ b/arch/arm64/kvm/sys_regs.h
@@ -35,6 +35,9 @@ struct sys_reg_params {
35}; 35};
36 36
37struct sys_reg_desc { 37struct sys_reg_desc {
38 /* Sysreg string for debug */
39 const char *name;
40
38 /* MRS/MSR instruction which accesses it. */ 41 /* MRS/MSR instruction which accesses it. */
39 u8 Op0; 42 u8 Op0;
40 u8 Op1; 43 u8 Op1;
@@ -130,6 +133,7 @@ const struct sys_reg_desc *find_reg_by_id(u64 id,
130#define Op2(_x) .Op2 = _x 133#define Op2(_x) .Op2 = _x
131 134
132#define SYS_DESC(reg) \ 135#define SYS_DESC(reg) \
136 .name = #reg, \
133 Op0(sys_reg_Op0(reg)), Op1(sys_reg_Op1(reg)), \ 137 Op0(sys_reg_Op0(reg)), Op1(sys_reg_Op1(reg)), \
134 CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ 138 CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \
135 Op2(sys_reg_Op2(reg)) 139 Op2(sys_reg_Op2(reg))
diff --git a/arch/arm64/kvm/trace.h b/arch/arm64/kvm/trace.h
index 3b82fb1ddd09..eab91ad0effb 100644
--- a/arch/arm64/kvm/trace.h
+++ b/arch/arm64/kvm/trace.h
@@ -3,6 +3,7 @@
3#define _TRACE_ARM64_KVM_H 3#define _TRACE_ARM64_KVM_H
4 4
5#include <linux/tracepoint.h> 5#include <linux/tracepoint.h>
6#include "sys_regs.h"
6 7
7#undef TRACE_SYSTEM 8#undef TRACE_SYSTEM
8#define TRACE_SYSTEM kvm 9#define TRACE_SYSTEM kvm
@@ -152,6 +153,40 @@ TRACE_EVENT(kvm_handle_sys_reg,
152 TP_printk("HSR 0x%08lx", __entry->hsr) 153 TP_printk("HSR 0x%08lx", __entry->hsr)
153); 154);
154 155
156TRACE_EVENT(kvm_sys_access,
157 TP_PROTO(unsigned long vcpu_pc, struct sys_reg_params *params, const struct sys_reg_desc *reg),
158 TP_ARGS(vcpu_pc, params, reg),
159
160 TP_STRUCT__entry(
161 __field(unsigned long, vcpu_pc)
162 __field(bool, is_write)
163 __field(const char *, name)
164 __field(u8, Op0)
165 __field(u8, Op1)
166 __field(u8, CRn)
167 __field(u8, CRm)
168 __field(u8, Op2)
169 ),
170
171 TP_fast_assign(
172 __entry->vcpu_pc = vcpu_pc;
173 __entry->is_write = params->is_write;
174 __entry->name = reg->name;
175 __entry->Op0 = reg->Op0;
176 __entry->Op0 = reg->Op0;
177 __entry->Op1 = reg->Op1;
178 __entry->CRn = reg->CRn;
179 __entry->CRm = reg->CRm;
180 __entry->Op2 = reg->Op2;
181 ),
182
183 TP_printk("PC: %lx %s (%d,%d,%d,%d,%d) %s",
184 __entry->vcpu_pc, __entry->name ?: "UNKN",
185 __entry->Op0, __entry->Op1, __entry->CRn,
186 __entry->CRm, __entry->Op2,
187 __entry->is_write ? "write" : "read")
188);
189
155TRACE_EVENT(kvm_set_guest_debug, 190TRACE_EVENT(kvm_set_guest_debug,
156 TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug), 191 TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug),
157 TP_ARGS(vcpu, guest_debug), 192 TP_ARGS(vcpu, guest_debug),
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index e445026858bc..d2abd98471e8 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -936,7 +936,7 @@ enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu,
936#define KVM_ARCH_WANT_MMU_NOTIFIER 936#define KVM_ARCH_WANT_MMU_NOTIFIER
937int kvm_unmap_hva_range(struct kvm *kvm, 937int kvm_unmap_hva_range(struct kvm *kvm,
938 unsigned long start, unsigned long end); 938 unsigned long start, unsigned long end);
939void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); 939int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
940int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); 940int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
941int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); 941int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
942 942
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 1fcc4d149054..3734cd58895e 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -1004,14 +1004,37 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
1004{ 1004{
1005 struct kvm_memslots *slots; 1005 struct kvm_memslots *slots;
1006 struct kvm_memory_slot *memslot; 1006 struct kvm_memory_slot *memslot;
1007 bool is_dirty = false; 1007 bool flush = false;
1008 int r; 1008 int r;
1009 1009
1010 mutex_lock(&kvm->slots_lock); 1010 mutex_lock(&kvm->slots_lock);
1011 1011
1012 r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); 1012 r = kvm_get_dirty_log_protect(kvm, log, &flush);
1013 1013
1014 if (is_dirty) { 1014 if (flush) {
1015 slots = kvm_memslots(kvm);
1016 memslot = id_to_memslot(slots, log->slot);
1017
1018 /* Let implementation handle TLB/GVA invalidation */
1019 kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot);
1020 }
1021
1022 mutex_unlock(&kvm->slots_lock);
1023 return r;
1024}
1025
1026int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
1027{
1028 struct kvm_memslots *slots;
1029 struct kvm_memory_slot *memslot;
1030 bool flush = false;
1031 int r;
1032
1033 mutex_lock(&kvm->slots_lock);
1034
1035 r = kvm_clear_dirty_log_protect(kvm, log, &flush);
1036
1037 if (flush) {
1015 slots = kvm_memslots(kvm); 1038 slots = kvm_memslots(kvm);
1016 memslot = id_to_memslot(slots, log->slot); 1039 memslot = id_to_memslot(slots, log->slot);
1017 1040
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index d8dcdb350405..97e538a8c1be 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -551,7 +551,7 @@ static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
551 (pte_dirty(old_pte) && !pte_dirty(hva_pte)); 551 (pte_dirty(old_pte) && !pte_dirty(hva_pte));
552} 552}
553 553
554void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) 554int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
555{ 555{
556 unsigned long end = hva + PAGE_SIZE; 556 unsigned long end = hva + PAGE_SIZE;
557 int ret; 557 int ret;
@@ -559,6 +559,7 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
559 ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte); 559 ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte);
560 if (ret) 560 if (ret)
561 kvm_mips_callbacks->flush_shadow_all(kvm); 561 kvm_mips_callbacks->flush_shadow_all(kvm);
562 return 0;
562} 563}
563 564
564static int kvm_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, 565static int kvm_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index 33a4fc891947..463c63a9fcf1 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -335,6 +335,7 @@
335#define H_SET_PARTITION_TABLE 0xF800 335#define H_SET_PARTITION_TABLE 0xF800
336#define H_ENTER_NESTED 0xF804 336#define H_ENTER_NESTED 0xF804
337#define H_TLB_INVALIDATE 0xF808 337#define H_TLB_INVALIDATE 0xF808
338#define H_COPY_TOFROM_GUEST 0xF80C
338 339
339/* Values for 2nd argument to H_SET_MODE */ 340/* Values for 2nd argument to H_SET_MODE */
340#define H_SET_MODE_RESOURCE_SET_CIABR 1 341#define H_SET_MODE_RESOURCE_SET_CIABR 1
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 09f8e9ba69bc..38f1b879f569 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -188,6 +188,13 @@ extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc);
188extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run, 188extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run,
189 struct kvm_vcpu *vcpu, 189 struct kvm_vcpu *vcpu,
190 unsigned long ea, unsigned long dsisr); 190 unsigned long ea, unsigned long dsisr);
191extern unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
192 gva_t eaddr, void *to, void *from,
193 unsigned long n);
194extern long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
195 void *to, unsigned long n);
196extern long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
197 void *from, unsigned long n);
191extern int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr, 198extern int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
192 struct kvmppc_pte *gpte, u64 root, 199 struct kvmppc_pte *gpte, u64 root,
193 u64 *pte_ret_p); 200 u64 *pte_ret_p);
@@ -196,8 +203,11 @@ extern int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
196 int table_index, u64 *pte_ret_p); 203 int table_index, u64 *pte_ret_p);
197extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, 204extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
198 struct kvmppc_pte *gpte, bool data, bool iswrite); 205 struct kvmppc_pte *gpte, bool data, bool iswrite);
206extern void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
207 unsigned int pshift, unsigned int lpid);
199extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa, 208extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
200 unsigned int shift, struct kvm_memory_slot *memslot, 209 unsigned int shift,
210 const struct kvm_memory_slot *memslot,
201 unsigned int lpid); 211 unsigned int lpid);
202extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, 212extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable,
203 bool writing, unsigned long gpa, 213 bool writing, unsigned long gpa,
@@ -215,16 +225,14 @@ extern int kvmppc_radix_init(void);
215extern void kvmppc_radix_exit(void); 225extern void kvmppc_radix_exit(void);
216extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, 226extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
217 unsigned long gfn); 227 unsigned long gfn);
218extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
219 unsigned long gpa, unsigned int shift,
220 struct kvm_memory_slot *memslot,
221 unsigned int lpid);
222extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, 228extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
223 unsigned long gfn); 229 unsigned long gfn);
224extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, 230extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
225 unsigned long gfn); 231 unsigned long gfn);
226extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, 232extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
227 struct kvm_memory_slot *memslot, unsigned long *map); 233 struct kvm_memory_slot *memslot, unsigned long *map);
234extern void kvmppc_radix_flush_memslot(struct kvm *kvm,
235 const struct kvm_memory_slot *memslot);
228extern int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info); 236extern int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info);
229 237
230/* XXX remove this export when load_last_inst() is generic */ 238/* XXX remove this export when load_last_inst() is generic */
@@ -242,7 +250,7 @@ extern kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa,
242 bool writing, bool *writable); 250 bool writing, bool *writable);
243extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, 251extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
244 unsigned long *rmap, long pte_index, int realmode); 252 unsigned long *rmap, long pte_index, int realmode);
245extern void kvmppc_update_dirty_map(struct kvm_memory_slot *memslot, 253extern void kvmppc_update_dirty_map(const struct kvm_memory_slot *memslot,
246 unsigned long gfn, unsigned long psize); 254 unsigned long gfn, unsigned long psize);
247extern void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep, 255extern void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep,
248 unsigned long pte_index); 256 unsigned long pte_index);
@@ -298,6 +306,7 @@ long kvmhv_nested_init(void);
298void kvmhv_nested_exit(void); 306void kvmhv_nested_exit(void);
299void kvmhv_vm_nested_init(struct kvm *kvm); 307void kvmhv_vm_nested_init(struct kvm *kvm);
300long kvmhv_set_partition_table(struct kvm_vcpu *vcpu); 308long kvmhv_set_partition_table(struct kvm_vcpu *vcpu);
309long kvmhv_copy_tofrom_guest_nested(struct kvm_vcpu *vcpu);
301void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1); 310void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1);
302void kvmhv_release_all_nested(struct kvm *kvm); 311void kvmhv_release_all_nested(struct kvm *kvm);
303long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu); 312long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu);
@@ -307,7 +316,7 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu,
307void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr); 316void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr);
308void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu, 317void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
309 struct hv_guest_state *hr); 318 struct hv_guest_state *hr);
310long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu); 319long int kvmhv_nested_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu);
311 320
312void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac); 321void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
313 322
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 6d298145d564..21b1ed5df888 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -55,6 +55,7 @@ struct kvm_nested_guest {
	cpumask_t need_tlb_flush;
	cpumask_t cpu_in_guest;
	short prev_cpu[NR_CPUS];
+	u8 radix;			/* is this nested guest radix */
 };
 
 /*
@@ -150,6 +151,18 @@ static inline bool kvm_is_radix(struct kvm *kvm)
	return kvm->arch.radix;
 }
 
+static inline bool kvmhv_vcpu_is_radix(struct kvm_vcpu *vcpu)
+{
+	bool radix;
+
+	if (vcpu->arch.nested)
+		radix = vcpu->arch.nested->radix;
+	else
+		radix = kvm_is_radix(vcpu->kvm);
+
+	return radix;
+}
+
 #define KVM_DEFAULT_HPT_ORDER	24	/* 16MB HPT by default */
 #endif
 
@@ -624,8 +637,11 @@ extern int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
			unsigned long *rmapp, struct rmap_nested **n_rmap);
 extern void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
			struct rmap_nested **n_rmap);
+extern void kvmhv_update_nest_rmap_rc_list(struct kvm *kvm, unsigned long *rmapp,
+			unsigned long clr, unsigned long set,
+			unsigned long hpa, unsigned long nbytes);
 extern void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
-			struct kvm_memory_slot *memslot,
+			const struct kvm_memory_slot *memslot,
			unsigned long gpa, unsigned long hpa,
			unsigned long nbytes);
 
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index fac6f631ed29..0f98f00da2ea 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -72,7 +72,7 @@ extern int kvm_unmap_hva_range(struct kvm *kvm,
			unsigned long start, unsigned long end);
 extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
 extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
-extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
+extern int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 
 #define HPTEG_CACHE_NUM			(1 << 15)
 #define HPTEG_HASH_BITS_PTE		13
@@ -793,6 +793,7 @@ struct kvm_vcpu_arch {
	/* For support of nested guests */
	struct kvm_nested_guest *nested;
	u32 nested_vcpu_id;
+	gpa_t nested_io_gpr;
 #endif
 
 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
@@ -827,6 +828,8 @@ struct kvm_vcpu_arch {
 #define KVM_MMIO_REG_FQPR	0x00c0
 #define KVM_MMIO_REG_VSX	0x0100
 #define KVM_MMIO_REG_VMX	0x0180
+#define KVM_MMIO_REG_NESTED_GPR	0xffc0
+
 
 #define __KVM_HAVE_ARCH_WQP
 #define __KVM_HAVE_CREATE_DEVICE
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 9b89b1918dfc..eb0d79f0ca45 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -224,7 +224,8 @@ extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
				const struct kvm_userspace_memory_region *mem,
				const struct kvm_memory_slot *old,
-				const struct kvm_memory_slot *new);
+				const struct kvm_memory_slot *new,
+				enum kvm_mr_change change);
 extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm,
				struct kvm_ppc_smmu_info *info);
 extern void kvmppc_core_flush_memslot(struct kvm *kvm,
@@ -294,7 +295,8 @@ struct kvmppc_ops {
	void (*commit_memory_region)(struct kvm *kvm,
				     const struct kvm_userspace_memory_region *mem,
				     const struct kvm_memory_slot *old,
-				     const struct kvm_memory_slot *new);
+				     const struct kvm_memory_slot *new,
+				     enum kvm_mr_change change);
	int (*unmap_hva_range)(struct kvm *kvm, unsigned long start,
			       unsigned long end);
	int (*age_hva)(struct kvm *kvm, unsigned long start, unsigned long end);
@@ -326,6 +328,10 @@ struct kvmppc_ops {
			      unsigned long flags);
	void (*giveup_ext)(struct kvm_vcpu *vcpu, ulong msr);
	int (*enable_nested)(struct kvm *kvm);
+	int (*load_from_eaddr)(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr,
+			       int size);
+	int (*store_to_eaddr)(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr,
+			      int size);
 };
 
 extern struct kvmppc_ops *kvmppc_hv_ops;
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 89d32bb79d5e..db2691ff4c0b 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -995,7 +995,16 @@ EXC_COMMON_BEGIN(h_data_storage_common)
	bl	save_nvgprs
	RECONCILE_IRQ_STATE(r10, r11)
	addi	r3,r1,STACK_FRAME_OVERHEAD
+BEGIN_MMU_FTR_SECTION
+	ld	r4,PACA_EXGEN+EX_DAR(r13)
+	lwz	r5,PACA_EXGEN+EX_DSISR(r13)
+	std	r4,_DAR(r1)
+	std	r5,_DSISR(r1)
+	li	r5,SIGSEGV
+	bl	bad_page_fault
+MMU_FTR_SECTION_ELSE
	bl	unknown_exception
+ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_TYPE_RADIX)
	b	ret_from_except
 
 
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index fd9893bc7aa1..bd1a677dd9e4 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -830,9 +830,10 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 void kvmppc_core_commit_memory_region(struct kvm *kvm,
				const struct kvm_userspace_memory_region *mem,
				const struct kvm_memory_slot *old,
-				const struct kvm_memory_slot *new)
+				const struct kvm_memory_slot *new,
+				enum kvm_mr_change change)
 {
-	kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new);
+	kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new, change);
 }
 
 int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
@@ -850,9 +851,10 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
	return kvm->arch.kvm_ops->test_age_hva(kvm, hva);
 }
 
-void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
 {
	kvm->arch.kvm_ops->set_spte_hva(kvm, hva, pte);
+	return 0;
 }
 
 void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index c615617e78ac..6f2d2fb4e098 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -743,12 +743,15 @@ void kvmppc_rmap_reset(struct kvm *kvm)
	srcu_idx = srcu_read_lock(&kvm->srcu);
	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots) {
+		/* Mutual exclusion with kvm_unmap_hva_range etc. */
+		spin_lock(&kvm->mmu_lock);
		/*
		 * This assumes it is acceptable to lose reference and
		 * change bits across a reset.
		 */
		memset(memslot->arch.rmap, 0,
		       memslot->npages * sizeof(*memslot->arch.rmap));
+		spin_unlock(&kvm->mmu_lock);
	}
	srcu_read_unlock(&kvm->srcu, srcu_idx);
 }
@@ -896,11 +899,12 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
 
	gfn = memslot->base_gfn;
	rmapp = memslot->arch.rmap;
+	if (kvm_is_radix(kvm)) {
+		kvmppc_radix_flush_memslot(kvm, memslot);
+		return;
+	}
+
	for (n = memslot->npages; n; --n, ++gfn) {
-		if (kvm_is_radix(kvm)) {
-			kvm_unmap_radix(kvm, memslot, gfn);
-			continue;
-		}
		/*
		 * Testing the present bit without locking is OK because
		 * the memslot has been marked invalid already, and hence
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index d68162ee159b..fb88167a402a 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -29,6 +29,103 @@
29 */ 29 */
30static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 }; 30static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
31 31
32unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
33 gva_t eaddr, void *to, void *from,
34 unsigned long n)
35{
36 unsigned long quadrant, ret = n;
37 int old_pid, old_lpid;
38 bool is_load = !!to;
39
40 /* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
41 if (kvmhv_on_pseries())
42 return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
43 __pa(to), __pa(from), n);
44
45 quadrant = 1;
46 if (!pid)
47 quadrant = 2;
48 if (is_load)
49 from = (void *) (eaddr | (quadrant << 62));
50 else
51 to = (void *) (eaddr | (quadrant << 62));
52
53 preempt_disable();
54
55 /* switch the lpid first to avoid running host with unallocated pid */
56 old_lpid = mfspr(SPRN_LPID);
57 if (old_lpid != lpid)
58 mtspr(SPRN_LPID, lpid);
59 if (quadrant == 1) {
60 old_pid = mfspr(SPRN_PID);
61 if (old_pid != pid)
62 mtspr(SPRN_PID, pid);
63 }
64 isync();
65
66 pagefault_disable();
67 if (is_load)
68 ret = raw_copy_from_user(to, from, n);
69 else
70 ret = raw_copy_to_user(to, from, n);
71 pagefault_enable();
72
73 /* switch the pid first to avoid running host with unallocated pid */
74 if (quadrant == 1 && pid != old_pid)
75 mtspr(SPRN_PID, old_pid);
76 if (lpid != old_lpid)
77 mtspr(SPRN_LPID, old_lpid);
78 isync();
79
80 preempt_enable();
81
82 return ret;
83}
84EXPORT_SYMBOL_GPL(__kvmhv_copy_tofrom_guest_radix);
85
86static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
87 void *to, void *from, unsigned long n)
88{
89 int lpid = vcpu->kvm->arch.lpid;
90 int pid = vcpu->arch.pid;
91
92 /* This would cause a data segment intr so don't allow the access */
93 if (eaddr & (0x3FFUL << 52))
94 return -EINVAL;
95
96 /* Should we be using the nested lpid */
97 if (vcpu->arch.nested)
98 lpid = vcpu->arch.nested->shadow_lpid;
99
100 /* If accessing quadrant 3 then pid is expected to be 0 */
101 if (((eaddr >> 62) & 0x3) == 0x3)
102 pid = 0;
103
104 eaddr &= ~(0xFFFUL << 52);
105
106 return __kvmhv_copy_tofrom_guest_radix(lpid, pid, eaddr, to, from, n);
107}
108
109long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to,
110 unsigned long n)
111{
112 long ret;
113
114 ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n);
115 if (ret > 0)
116 memset(to + (n - ret), 0, ret);
117
118 return ret;
119}
120EXPORT_SYMBOL_GPL(kvmhv_copy_from_guest_radix);
121
122long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from,
123 unsigned long n)
124{
125 return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n);
126}
127EXPORT_SYMBOL_GPL(kvmhv_copy_to_guest_radix);
128
32int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr, 129int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
33 struct kvmppc_pte *gpte, u64 root, 130 struct kvmppc_pte *gpte, u64 root,
34 u64 *pte_ret_p) 131 u64 *pte_ret_p)
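
The new __kvmhv_copy_tofrom_guest_radix() above reaches the guest's memory through the POWER9 quadrant mechanism: with the target LPID (and, for quadrant 1, the PID) loaded into the SPRs, a load or store whose top two address bits select quadrant 1 or 2 is translated by the guest's page tables instead of the host's. A minimal user-space sketch of just the address arithmetic, with the caller's top-bit masking folded in (the function name and that fold-in are illustrative, not kernel code):

    #include <stdint.h>
    #include <stdio.h>

    /* Mirrors the "eaddr | (quadrant << 62)" computation above; quadrant 2
     * is used when pid == 0, quadrant 1 otherwise. */
    static uint64_t quadrant_addr(uint64_t eaddr, int pid)
    {
            uint64_t quadrant = pid ? 1 : 2;

            return (eaddr & ~(0xFFFULL << 52)) | (quadrant << 62);
    }

    int main(void)
    {
            printf("%#llx\n", (unsigned long long)quadrant_addr(0x1000, 7));
            printf("%#llx\n", (unsigned long long)quadrant_addr(0x1000, 0));
            return 0;
    }
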
@@ -197,8 +294,8 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
197 return 0; 294 return 0;
198} 295}
199 296
200static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr, 297void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
201 unsigned int pshift, unsigned int lpid) 298 unsigned int pshift, unsigned int lpid)
202{ 299{
203 unsigned long psize = PAGE_SIZE; 300 unsigned long psize = PAGE_SIZE;
204 int psi; 301 int psi;
@@ -284,7 +381,8 @@ static void kvmppc_pmd_free(pmd_t *pmdp)
284 381
285/* Called with kvm->mmu_lock held */ 382/* Called with kvm->mmu_lock held */
286void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa, 383void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
287 unsigned int shift, struct kvm_memory_slot *memslot, 384 unsigned int shift,
385 const struct kvm_memory_slot *memslot,
288 unsigned int lpid) 386 unsigned int lpid)
289 387
290{ 388{
@@ -683,6 +781,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
683 pte_t pte, *ptep; 781 pte_t pte, *ptep;
684 unsigned int shift, level; 782 unsigned int shift, level;
685 int ret; 783 int ret;
784 bool large_enable;
686 785
687 /* used to check for invalidations in progress */ 786 /* used to check for invalidations in progress */
688 mmu_seq = kvm->mmu_notifier_seq; 787 mmu_seq = kvm->mmu_notifier_seq;
@@ -732,12 +831,15 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
732 pte = *ptep; 831 pte = *ptep;
733 local_irq_enable(); 832 local_irq_enable();
734 833
834 /* If we're logging dirty pages, always map single pages */
835 large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);
836
735 /* Get pte level from shift/size */ 837 /* Get pte level from shift/size */
736 if (shift == PUD_SHIFT && 838 if (large_enable && shift == PUD_SHIFT &&
737 (gpa & (PUD_SIZE - PAGE_SIZE)) == 839 (gpa & (PUD_SIZE - PAGE_SIZE)) ==
738 (hva & (PUD_SIZE - PAGE_SIZE))) { 840 (hva & (PUD_SIZE - PAGE_SIZE))) {
739 level = 2; 841 level = 2;
740 } else if (shift == PMD_SHIFT && 842 } else if (large_enable && shift == PMD_SHIFT &&
741 (gpa & (PMD_SIZE - PAGE_SIZE)) == 843 (gpa & (PMD_SIZE - PAGE_SIZE)) ==
742 (hva & (PMD_SIZE - PAGE_SIZE))) { 844 (hva & (PMD_SIZE - PAGE_SIZE))) {
743 level = 1; 845 level = 1;
@@ -857,7 +959,7 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
857 return ret; 959 return ret;
858} 960}
859 961
860/* Called with kvm->lock held */ 962/* Called with kvm->mmu_lock held */
861int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, 963int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
862 unsigned long gfn) 964 unsigned long gfn)
863{ 965{
@@ -872,7 +974,7 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
872 return 0; 974 return 0;
873} 975}
874 976
875/* Called with kvm->lock held */ 977/* Called with kvm->mmu_lock held */
876int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, 978int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
877 unsigned long gfn) 979 unsigned long gfn)
878{ 980{
@@ -880,18 +982,24 @@ int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
880 unsigned long gpa = gfn << PAGE_SHIFT; 982 unsigned long gpa = gfn << PAGE_SHIFT;
881 unsigned int shift; 983 unsigned int shift;
882 int ref = 0; 984 int ref = 0;
985 unsigned long old, *rmapp;
883 986
884 ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); 987 ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
885 if (ptep && pte_present(*ptep) && pte_young(*ptep)) { 988 if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
886 kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0, 989 old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
887 gpa, shift); 990 gpa, shift);
888 /* XXX need to flush tlb here? */ 991 /* XXX need to flush tlb here? */
992 /* Also clear bit in ptes in shadow pgtable for nested guests */
993 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
994 kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
995 old & PTE_RPN_MASK,
996 1UL << shift);
889 ref = 1; 997 ref = 1;
890 } 998 }
891 return ref; 999 return ref;
892} 1000}
893 1001
894/* Called with kvm->lock held */ 1002/* Called with kvm->mmu_lock held */
895int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, 1003int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
896 unsigned long gfn) 1004 unsigned long gfn)
897{ 1005{
@@ -915,15 +1023,23 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm,
915 pte_t *ptep; 1023 pte_t *ptep;
916 unsigned int shift; 1024 unsigned int shift;
917 int ret = 0; 1025 int ret = 0;
1026 unsigned long old, *rmapp;
918 1027
919 ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); 1028 ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
920 if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) { 1029 if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) {
921 ret = 1; 1030 ret = 1;
922 if (shift) 1031 if (shift)
923 ret = 1 << (shift - PAGE_SHIFT); 1032 ret = 1 << (shift - PAGE_SHIFT);
924 kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0, 1033 spin_lock(&kvm->mmu_lock);
925 gpa, shift); 1034 old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
1035 gpa, shift);
926 kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid); 1036 kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
1037 /* Also clear bit in ptes in shadow pgtable for nested guests */
1038 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
1039 kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_DIRTY, 0,
1040 old & PTE_RPN_MASK,
1041 1UL << shift);
1042 spin_unlock(&kvm->mmu_lock);
927 } 1043 }
928 return ret; 1044 return ret;
929} 1045}
@@ -953,6 +1069,26 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
953 return 0; 1069 return 0;
954} 1070}
955 1071
1072void kvmppc_radix_flush_memslot(struct kvm *kvm,
1073 const struct kvm_memory_slot *memslot)
1074{
1075 unsigned long n;
1076 pte_t *ptep;
1077 unsigned long gpa;
1078 unsigned int shift;
1079
1080 gpa = memslot->base_gfn << PAGE_SHIFT;
1081 spin_lock(&kvm->mmu_lock);
1082 for (n = memslot->npages; n; --n) {
1083 ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
1084 if (ptep && pte_present(*ptep))
1085 kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
1086 kvm->arch.lpid);
1087 gpa += PAGE_SIZE;
1088 }
1089 spin_unlock(&kvm->mmu_lock);
1090}
1091
956static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info, 1092static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
957 int psize, int *indexp) 1093 int psize, int *indexp)
958{ 1094{
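
kvm_radix_test_clear_dirty() above reports how many base pages the dirty PTE covers, and the earlier large_enable hunk forces single-page mappings while dirty logging is enabled, because one dirty bit on a 2 MB PTE would otherwise mark hundreds of base pages dirty at once. A small self-contained sketch of that page-count arithmetic, assuming 4 KB base pages (names are illustrative):

    #include <stdio.h>

    #define PAGE_SHIFT 12   /* 4 KB base pages */

    /* Matches the "1 << (shift - PAGE_SHIFT)" returned above; shift == 0
     * means an ordinary base page. */
    static int pte_npages(unsigned int shift)
    {
            return shift ? 1 << (shift - PAGE_SHIFT) : 1;
    }

    int main(void)
    {
            printf("4K pte -> %d page(s)\n", pte_npages(0));
            printf("2M pte -> %d page(s)\n", pte_npages(21));
            printf("1G pte -> %d page(s)\n", pte_npages(30));
            return 0;
    }
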
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index a56f8413758a..5a066fc299e1 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -985,6 +985,10 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
985 kvmppc_set_gpr(vcpu, 3, 0); 985 kvmppc_set_gpr(vcpu, 3, 0);
986 vcpu->arch.hcall_needed = 0; 986 vcpu->arch.hcall_needed = 0;
987 return -EINTR; 987 return -EINTR;
988 } else if (ret == H_TOO_HARD) {
989 kvmppc_set_gpr(vcpu, 3, 0);
990 vcpu->arch.hcall_needed = 0;
991 return RESUME_HOST;
988 } 992 }
989 break; 993 break;
990 case H_TLB_INVALIDATE: 994 case H_TLB_INVALIDATE:
@@ -992,7 +996,11 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
992 if (nesting_enabled(vcpu->kvm)) 996 if (nesting_enabled(vcpu->kvm))
993 ret = kvmhv_do_nested_tlbie(vcpu); 997 ret = kvmhv_do_nested_tlbie(vcpu);
994 break; 998 break;
995 999 case H_COPY_TOFROM_GUEST:
1000 ret = H_FUNCTION;
1001 if (nesting_enabled(vcpu->kvm))
1002 ret = kvmhv_copy_tofrom_guest_nested(vcpu);
1003 break;
996 default: 1004 default:
997 return RESUME_HOST; 1005 return RESUME_HOST;
998 } 1006 }
@@ -1336,7 +1344,7 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
1336 return r; 1344 return r;
1337} 1345}
1338 1346
1339static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) 1347static int kvmppc_handle_nested_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
1340{ 1348{
1341 int r; 1349 int r;
1342 int srcu_idx; 1350 int srcu_idx;
@@ -1394,7 +1402,7 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
1394 */ 1402 */
1395 case BOOK3S_INTERRUPT_H_DATA_STORAGE: 1403 case BOOK3S_INTERRUPT_H_DATA_STORAGE:
1396 srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 1404 srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
1397 r = kvmhv_nested_page_fault(vcpu); 1405 r = kvmhv_nested_page_fault(run, vcpu);
1398 srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); 1406 srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
1399 break; 1407 break;
1400 case BOOK3S_INTERRUPT_H_INST_STORAGE: 1408 case BOOK3S_INTERRUPT_H_INST_STORAGE:
@@ -1404,7 +1412,7 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
1404 if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE) 1412 if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
1405 vcpu->arch.fault_dsisr |= DSISR_ISSTORE; 1413 vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
1406 srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 1414 srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
1407 r = kvmhv_nested_page_fault(vcpu); 1415 r = kvmhv_nested_page_fault(run, vcpu);
1408 srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); 1416 srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
1409 break; 1417 break;
1410 1418
@@ -4059,7 +4067,7 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
4059 if (!nested) 4067 if (!nested)
4060 r = kvmppc_handle_exit_hv(kvm_run, vcpu, current); 4068 r = kvmppc_handle_exit_hv(kvm_run, vcpu, current);
4061 else 4069 else
4062 r = kvmppc_handle_nested_exit(vcpu); 4070 r = kvmppc_handle_nested_exit(kvm_run, vcpu);
4063 } 4071 }
4064 vcpu->arch.ret = r; 4072 vcpu->arch.ret = r;
4065 4073
@@ -4371,7 +4379,8 @@ static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm,
 static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
				const struct kvm_userspace_memory_region *mem,
				const struct kvm_memory_slot *old,
-				const struct kvm_memory_slot *new)
+				const struct kvm_memory_slot *new,
+				enum kvm_mr_change change)
 {
	unsigned long npages = mem->memory_size >> PAGE_SHIFT;
 
@@ -4383,6 +4392,23 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
	 */
	if (npages)
		atomic64_inc(&kvm->arch.mmio_update);
+
+	/*
+	 * For change == KVM_MR_MOVE or KVM_MR_DELETE, higher levels
+	 * have already called kvm_arch_flush_shadow_memslot() to
+	 * flush shadow mappings.  For KVM_MR_CREATE we have no
+	 * previous mappings.  So the only case to handle is
+	 * KVM_MR_FLAGS_ONLY when the KVM_MEM_LOG_DIRTY_PAGES bit
+	 * has been changed.
+	 * For radix guests, we flush on setting KVM_MEM_LOG_DIRTY_PAGES
+	 * to get rid of any THP PTEs in the partition-scoped page tables
+	 * so we can track dirtiness at the page level; we flush when
+	 * clearing KVM_MEM_LOG_DIRTY_PAGES so that we can go back to
+	 * using THP PTEs.
+	 */
+	if (change == KVM_MR_FLAGS_ONLY && kvm_is_radix(kvm) &&
+	    ((new->flags ^ old->flags) & KVM_MEM_LOG_DIRTY_PAGES))
+		kvmppc_radix_flush_memslot(kvm, old);
 }
 
 /*
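
The check added above only reacts to KVM_MR_FLAGS_ONLY updates that actually toggle dirty logging; the XOR of the old and new flags catches a change in either direction. A tiny user-space sketch of that test (the flag value here is a stand-in rather than the uapi definition):

    #include <stdbool.h>
    #include <stdio.h>

    #define LOG_DIRTY_PAGES (1UL << 0)   /* stand-in for KVM_MEM_LOG_DIRTY_PAGES */

    static bool dirty_log_toggled(unsigned long old_flags, unsigned long new_flags)
    {
            return (new_flags ^ old_flags) & LOG_DIRTY_PAGES;
    }

    int main(void)
    {
            printf("%d\n", dirty_log_toggled(0, LOG_DIRTY_PAGES));  /* 1: turned on  */
            printf("%d\n", dirty_log_toggled(LOG_DIRTY_PAGES, 0));  /* 1: turned off */
            printf("%d\n", dirty_log_toggled(0, 0));                /* 0: unchanged  */
            return 0;
    }
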
@@ -4532,12 +4558,15 @@ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
 {
	if (nesting_enabled(kvm))
		kvmhv_release_all_nested(kvm);
+	kvmppc_rmap_reset(kvm);
+	kvm->arch.process_table = 0;
+	/* Mutual exclusion with kvm_unmap_hva_range etc. */
+	spin_lock(&kvm->mmu_lock);
+	kvm->arch.radix = 0;
+	spin_unlock(&kvm->mmu_lock);
	kvmppc_free_radix(kvm);
	kvmppc_update_lpcr(kvm, LPCR_VPM1,
			   LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
-	kvmppc_rmap_reset(kvm);
-	kvm->arch.radix = 0;
-	kvm->arch.process_table = 0;
	return 0;
 }
 
@@ -4549,12 +4578,14 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
	err = kvmppc_init_vm_radix(kvm);
	if (err)
		return err;
-
+	kvmppc_rmap_reset(kvm);
+	/* Mutual exclusion with kvm_unmap_hva_range etc. */
+	spin_lock(&kvm->mmu_lock);
+	kvm->arch.radix = 1;
+	spin_unlock(&kvm->mmu_lock);
	kvmppc_free_hpt(&kvm->arch.hpt);
	kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR,
			   LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
-	kvmppc_rmap_reset(kvm);
-	kvm->arch.radix = 1;
	return 0;
 }
 
@@ -5214,6 +5245,44 @@ static int kvmhv_enable_nested(struct kvm *kvm)
5214 return 0; 5245 return 0;
5215} 5246}
5216 5247
5248static int kvmhv_load_from_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr,
5249 int size)
5250{
5251 int rc = -EINVAL;
5252
5253 if (kvmhv_vcpu_is_radix(vcpu)) {
5254 rc = kvmhv_copy_from_guest_radix(vcpu, *eaddr, ptr, size);
5255
5256 if (rc > 0)
5257 rc = -EINVAL;
5258 }
5259
5260 /* For now quadrants are the only way to access nested guest memory */
5261 if (rc && vcpu->arch.nested)
5262 rc = -EAGAIN;
5263
5264 return rc;
5265}
5266
5267static int kvmhv_store_to_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr,
5268 int size)
5269{
5270 int rc = -EINVAL;
5271
5272 if (kvmhv_vcpu_is_radix(vcpu)) {
5273 rc = kvmhv_copy_to_guest_radix(vcpu, *eaddr, ptr, size);
5274
5275 if (rc > 0)
5276 rc = -EINVAL;
5277 }
5278
5279 /* For now quadrants are the only way to access nested guest memory */
5280 if (rc && vcpu->arch.nested)
5281 rc = -EAGAIN;
5282
5283 return rc;
5284}
5285
 static struct kvmppc_ops kvm_ops_hv = {
	.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
	.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
@@ -5254,6 +5323,8 @@ static struct kvmppc_ops kvm_ops_hv = {
	.get_rmmu_info = kvmhv_get_rmmu_info,
	.set_smt_mode = kvmhv_set_smt_mode,
	.enable_nested = kvmhv_enable_nested,
+	.load_from_eaddr = kvmhv_load_from_eaddr,
+	.store_to_eaddr = kvmhv_store_to_eaddr,
 };
 
 static int kvm_init_subcore_bitmap(void)
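
The kvmhv_load_from_eaddr()/kvmhv_store_to_eaddr() hooks registered above share one return-code policy: a radix copy that leaves bytes untransferred is reported as -EINVAL, and any failure on a nested vcpu becomes -EAGAIN, since quadrant access is currently the only way to reach nested guest memory and the generic code must not fall back to its MMU-walk path. A compact user-space sketch of that policy (the function name and the "left" parameter are illustrative):

    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* "left" is the number of bytes the radix copy failed to transfer. */
    static int eaddr_access_rc(bool vcpu_is_radix, bool vcpu_is_nested, long left)
    {
            int rc = -EINVAL;

            if (vcpu_is_radix) {
                    rc = (int)left;
                    if (rc > 0)             /* partial copy counts as failure */
                            rc = -EINVAL;
            }

            /* Quadrants are the only route to nested guest memory for now,
             * so a failure there must not fall back to translation. */
            if (rc && vcpu_is_nested)
                    rc = -EAGAIN;

            return rc;
    }

    int main(void)
    {
            printf("%d\n", eaddr_access_rc(true, false, 0));  /* 0: success        */
            printf("%d\n", eaddr_access_rc(true, true, 4));   /* -EAGAIN: nested   */
            printf("%d\n", eaddr_access_rc(false, false, 0)); /* -EINVAL: HPT vcpu */
            return 0;
    }
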
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index 401d2ecbebc5..735e0ac6f5b2 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -195,6 +195,26 @@ void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
195 vcpu->arch.ppr = hr->ppr; 195 vcpu->arch.ppr = hr->ppr;
196} 196}
197 197
198static void kvmhv_nested_mmio_needed(struct kvm_vcpu *vcpu, u64 regs_ptr)
199{
200 /* No need to reflect the page fault to L1, we've handled it */
201 vcpu->arch.trap = 0;
202
203 /*
204 * Since the L2 gprs have already been written back into L1 memory when
205 * we complete the mmio, store the L1 memory location of the L2 gpr
206 * being loaded into by the mmio so that the loaded value can be
207 * written there in kvmppc_complete_mmio_load()
208 */
209 if (((vcpu->arch.io_gpr & KVM_MMIO_REG_EXT_MASK) == KVM_MMIO_REG_GPR)
210 && (vcpu->mmio_is_write == 0)) {
211 vcpu->arch.nested_io_gpr = (gpa_t) regs_ptr +
212 offsetof(struct pt_regs,
213 gpr[vcpu->arch.io_gpr]);
214 vcpu->arch.io_gpr = KVM_MMIO_REG_NESTED_GPR;
215 }
216}
217
198long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) 218long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
199{ 219{
200 long int err, r; 220 long int err, r;
@@ -316,6 +336,11 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
	if (r == -EINTR)
		return H_INTERRUPT;
 
+	if (vcpu->mmio_needed) {
+		kvmhv_nested_mmio_needed(vcpu, regs_ptr);
+		return H_TOO_HARD;
+	}
+
	return vcpu->arch.trap;
 }
 
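
kvmhv_nested_mmio_needed() above records where in L1 memory the emulated MMIO result must eventually be written: the pt_regs block whose L1 guest-physical address the L1 passed to H_ENTER_NESTED, plus the offset of the target GPR slot. A user-space sketch of that address computation, using a stand-in register-file layout and the portable offset-plus-index form (the kernel computes offsetof(struct pt_regs, gpr[idx]) directly):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Stand-in for the guest-visible register block; only gpr[] matters here. */
    struct regs_sketch {
            uint64_t gpr[32];
    };

    static uint64_t nested_io_gpr(uint64_t regs_ptr, unsigned int gpr_index)
    {
            return regs_ptr + offsetof(struct regs_sketch, gpr) +
                   gpr_index * sizeof(uint64_t);
    }

    int main(void)
    {
            /* GPR 5 of a pt_regs block that lives at L1 gpa 0x4000 */
            printf("%#llx\n", (unsigned long long)nested_io_gpr(0x4000, 5));
            return 0;
    }
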
@@ -437,6 +462,81 @@ long kvmhv_set_partition_table(struct kvm_vcpu *vcpu)
437} 462}
438 463
439/* 464/*
465 * Handle the H_COPY_TOFROM_GUEST hcall.
466 * r4 = L1 lpid of nested guest
467 * r5 = pid
468 * r6 = eaddr to access
469 * r7 = to buffer (L1 gpa)
470 * r8 = from buffer (L1 gpa)
471 * r9 = n bytes to copy
472 */
473long kvmhv_copy_tofrom_guest_nested(struct kvm_vcpu *vcpu)
474{
475 struct kvm_nested_guest *gp;
476 int l1_lpid = kvmppc_get_gpr(vcpu, 4);
477 int pid = kvmppc_get_gpr(vcpu, 5);
478 gva_t eaddr = kvmppc_get_gpr(vcpu, 6);
479 gpa_t gp_to = (gpa_t) kvmppc_get_gpr(vcpu, 7);
480 gpa_t gp_from = (gpa_t) kvmppc_get_gpr(vcpu, 8);
481 void *buf;
482 unsigned long n = kvmppc_get_gpr(vcpu, 9);
483 bool is_load = !!gp_to;
484 long rc;
485
486 if (gp_to && gp_from) /* One must be NULL to determine the direction */
487 return H_PARAMETER;
488
489 if (eaddr & (0xFFFUL << 52))
490 return H_PARAMETER;
491
492 buf = kzalloc(n, GFP_KERNEL);
493 if (!buf)
494 return H_NO_MEM;
495
496 gp = kvmhv_get_nested(vcpu->kvm, l1_lpid, false);
497 if (!gp) {
498 rc = H_PARAMETER;
499 goto out_free;
500 }
501
502 mutex_lock(&gp->tlb_lock);
503
504 if (is_load) {
505 /* Load from the nested guest into our buffer */
506 rc = __kvmhv_copy_tofrom_guest_radix(gp->shadow_lpid, pid,
507 eaddr, buf, NULL, n);
508 if (rc)
509 goto not_found;
510
511 /* Write what was loaded into our buffer back to the L1 guest */
512 rc = kvm_vcpu_write_guest(vcpu, gp_to, buf, n);
513 if (rc)
514 goto not_found;
515 } else {
516 /* Load the data to be stored from the L1 guest into our buf */
517 rc = kvm_vcpu_read_guest(vcpu, gp_from, buf, n);
518 if (rc)
519 goto not_found;
520
521 /* Store from our buffer into the nested guest */
522 rc = __kvmhv_copy_tofrom_guest_radix(gp->shadow_lpid, pid,
523 eaddr, NULL, buf, n);
524 if (rc)
525 goto not_found;
526 }
527
528out_unlock:
529 mutex_unlock(&gp->tlb_lock);
530 kvmhv_put_nested(gp);
531out_free:
532 kfree(buf);
533 return rc;
534not_found:
535 rc = H_NOT_FOUND;
536 goto out_unlock;
537}
538
539/*
440 * Reload the partition table entry for a guest. 540 * Reload the partition table entry for a guest.
441 * Caller must hold gp->tlb_lock. 541 * Caller must hold gp->tlb_lock.
442 */ 542 */
@@ -480,6 +580,7 @@ struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
480 if (shadow_lpid < 0) 580 if (shadow_lpid < 0)
481 goto out_free2; 581 goto out_free2;
482 gp->shadow_lpid = shadow_lpid; 582 gp->shadow_lpid = shadow_lpid;
583 gp->radix = 1;
483 584
484 memset(gp->prev_cpu, -1, sizeof(gp->prev_cpu)); 585 memset(gp->prev_cpu, -1, sizeof(gp->prev_cpu));
485 586
@@ -687,6 +788,57 @@ void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
687 *n_rmap = NULL; 788 *n_rmap = NULL;
688} 789}
689 790
791static void kvmhv_update_nest_rmap_rc(struct kvm *kvm, u64 n_rmap,
792 unsigned long clr, unsigned long set,
793 unsigned long hpa, unsigned long mask)
794{
795 struct kvm_nested_guest *gp;
796 unsigned long gpa;
797 unsigned int shift, lpid;
798 pte_t *ptep;
799
800 gpa = n_rmap & RMAP_NESTED_GPA_MASK;
801 lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
802 gp = kvmhv_find_nested(kvm, lpid);
803 if (!gp)
804 return;
805
806 /* Find the pte */
807 ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
808 /*
809 * If the pte is present and the pfn is still the same, update the pte.
810 * If the pfn has changed then this is a stale rmap entry, the nested
811 * gpa actually points somewhere else now, and there is nothing to do.
812 * XXX A future optimisation would be to remove the rmap entry here.
813 */
814 if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa)) {
815 __radix_pte_update(ptep, clr, set);
816 kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
817 }
818}
819
820/*
821 * For a given list of rmap entries, update the rc bits in all ptes in shadow
822 * page tables for nested guests which are referenced by the rmap list.
823 */
824void kvmhv_update_nest_rmap_rc_list(struct kvm *kvm, unsigned long *rmapp,
825 unsigned long clr, unsigned long set,
826 unsigned long hpa, unsigned long nbytes)
827{
828 struct llist_node *entry = ((struct llist_head *) rmapp)->first;
829 struct rmap_nested *cursor;
830 unsigned long rmap, mask;
831
832 if ((clr | set) & ~(_PAGE_DIRTY | _PAGE_ACCESSED))
833 return;
834
835 mask = PTE_RPN_MASK & ~(nbytes - 1);
836 hpa &= mask;
837
838 for_each_nest_rmap_safe(cursor, entry, &rmap)
839 kvmhv_update_nest_rmap_rc(kvm, rmap, clr, set, hpa, mask);
840}
841
690static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap, 842static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap,
691 unsigned long hpa, unsigned long mask) 843 unsigned long hpa, unsigned long mask)
692{ 844{
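
kvmhv_update_nest_rmap_rc() above only touches a shadow PTE if it still points into the host page whose R/C bits changed; the mask keeps the real-page-number bits and drops everything below the size of the page being updated. A user-space sketch of that comparison, with a stand-in RPN mask instead of the kernel's PTE_RPN_MASK:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define SKETCH_RPN_MASK 0x01fffffffffff000ULL   /* stand-in value */

    static bool pte_still_maps(uint64_t pte_val, uint64_t hpa, uint64_t nbytes)
    {
            uint64_t mask = SKETCH_RPN_MASK & ~(nbytes - 1);

            return (pte_val & mask) == (hpa & mask);
    }

    int main(void)
    {
            /* a 64 KB shadow pte still pointing into host page 0x12340000 */
            printf("%d\n", pte_still_maps(0x12345000, 0x12340000, 0x10000)); /* 1 */
            printf("%d\n", pte_still_maps(0x99990000, 0x12340000, 0x10000)); /* 0 */
            return 0;
    }
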
@@ -723,7 +875,7 @@ static void kvmhv_remove_nest_rmap_list(struct kvm *kvm, unsigned long *rmapp,
723 875
724/* called with kvm->mmu_lock held */ 876/* called with kvm->mmu_lock held */
725void kvmhv_remove_nest_rmap_range(struct kvm *kvm, 877void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
726 struct kvm_memory_slot *memslot, 878 const struct kvm_memory_slot *memslot,
727 unsigned long gpa, unsigned long hpa, 879 unsigned long gpa, unsigned long hpa,
728 unsigned long nbytes) 880 unsigned long nbytes)
729{ 881{
@@ -1049,7 +1201,7 @@ static long kvmhv_handle_nested_set_rc(struct kvm_vcpu *vcpu,
1049 struct kvm *kvm = vcpu->kvm; 1201 struct kvm *kvm = vcpu->kvm;
1050 bool writing = !!(dsisr & DSISR_ISSTORE); 1202 bool writing = !!(dsisr & DSISR_ISSTORE);
1051 u64 pgflags; 1203 u64 pgflags;
1052 bool ret; 1204 long ret;
1053 1205
1054 /* Are the rc bits set in the L1 partition scoped pte? */ 1206 /* Are the rc bits set in the L1 partition scoped pte? */
1055 pgflags = _PAGE_ACCESSED; 1207 pgflags = _PAGE_ACCESSED;
@@ -1062,16 +1214,22 @@ static long kvmhv_handle_nested_set_rc(struct kvm_vcpu *vcpu,
1062 /* Set the rc bit in the pte of our (L0) pgtable for the L1 guest */ 1214 /* Set the rc bit in the pte of our (L0) pgtable for the L1 guest */
1063 ret = kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, writing, 1215 ret = kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, writing,
1064 gpte.raddr, kvm->arch.lpid); 1216 gpte.raddr, kvm->arch.lpid);
1065 spin_unlock(&kvm->mmu_lock); 1217 if (!ret) {
1066 if (!ret) 1218 ret = -EINVAL;
1067 return -EINVAL; 1219 goto out_unlock;
1220 }
1068 1221
1069 /* Set the rc bit in the pte of the shadow_pgtable for the nest guest */ 1222 /* Set the rc bit in the pte of the shadow_pgtable for the nest guest */
1070 ret = kvmppc_hv_handle_set_rc(kvm, gp->shadow_pgtable, writing, n_gpa, 1223 ret = kvmppc_hv_handle_set_rc(kvm, gp->shadow_pgtable, writing, n_gpa,
1071 gp->shadow_lpid); 1224 gp->shadow_lpid);
1072 if (!ret) 1225 if (!ret)
1073 return -EINVAL; 1226 ret = -EINVAL;
1074 return 0; 1227 else
1228 ret = 0;
1229
1230out_unlock:
1231 spin_unlock(&kvm->mmu_lock);
1232 return ret;
1075} 1233}
1076 1234
1077static inline int kvmppc_radix_level_to_shift(int level) 1235static inline int kvmppc_radix_level_to_shift(int level)
@@ -1099,7 +1257,8 @@ static inline int kvmppc_radix_shift_to_level(int shift)
1099} 1257}
1100 1258
1101/* called with gp->tlb_lock held */ 1259/* called with gp->tlb_lock held */
1102static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu, 1260static long int __kvmhv_nested_page_fault(struct kvm_run *run,
1261 struct kvm_vcpu *vcpu,
1103 struct kvm_nested_guest *gp) 1262 struct kvm_nested_guest *gp)
1104{ 1263{
1105 struct kvm *kvm = vcpu->kvm; 1264 struct kvm *kvm = vcpu->kvm;
@@ -1180,9 +1339,9 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
1180 kvmppc_core_queue_data_storage(vcpu, ea, dsisr); 1339 kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
1181 return RESUME_GUEST; 1340 return RESUME_GUEST;
1182 } 1341 }
1183 /* passthrough of emulated MMIO case... */ 1342
1184 pr_err("emulated MMIO passthrough?\n"); 1343 /* passthrough of emulated MMIO case */
1185 return -EINVAL; 1344 return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, writing);
1186 } 1345 }
1187 if (memslot->flags & KVM_MEM_READONLY) { 1346 if (memslot->flags & KVM_MEM_READONLY) {
1188 if (writing) { 1347 if (writing) {
@@ -1220,6 +1379,8 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
1220 return ret; 1379 return ret;
1221 shift = kvmppc_radix_level_to_shift(level); 1380 shift = kvmppc_radix_level_to_shift(level);
1222 } 1381 }
1382 /* Align gfn to the start of the page */
1383 gfn = (gpa & ~((1UL << shift) - 1)) >> PAGE_SHIFT;
1223 1384
1224 /* 3. Compute the pte we need to insert for nest_gpa -> host r_addr */ 1385 /* 3. Compute the pte we need to insert for nest_gpa -> host r_addr */
1225 1386
@@ -1227,6 +1388,9 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
	perm |= gpte.may_read ? 0UL : _PAGE_READ;
	perm |= gpte.may_write ? 0UL : _PAGE_WRITE;
	perm |= gpte.may_execute ? 0UL : _PAGE_EXEC;
+	/* Only set accessed/dirty (rc) bits if set in host and l1 guest ptes */
+	perm |= (gpte.rc & _PAGE_ACCESSED) ? 0UL : _PAGE_ACCESSED;
+	perm |= ((gpte.rc & _PAGE_DIRTY) && writing) ? 0UL : _PAGE_DIRTY;
	pte = __pte(pte_val(pte) & ~perm);
 
	/* What size pte can we insert? */
@@ -1264,13 +1428,13 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
	return RESUME_GUEST;
 }
 
-long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu)
+long int kvmhv_nested_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
	struct kvm_nested_guest *gp = vcpu->arch.nested;
	long int ret;
 
	mutex_lock(&gp->tlb_lock);
-	ret = __kvmhv_nested_page_fault(vcpu, gp);
+	ret = __kvmhv_nested_page_fault(run, vcpu, gp);
	mutex_unlock(&gp->tlb_lock);
	return ret;
 }
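
The permission handling added to __kvmhv_nested_page_fault() above withholds the accessed and dirty bits from the shadow PTE unless the L1's own PTE already has them (dirty additionally requires a write fault), so the next access traps and the R/C updates remain visible to the L1. A self-contained sketch of that mask computation, with stand-in flag values instead of the kernel's _PAGE_* constants:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define SK_READ     0x01
    #define SK_WRITE    0x02
    #define SK_EXEC     0x04
    #define SK_ACCESSED 0x08
    #define SK_DIRTY    0x10

    /* Returns the bits to strip from the shadow pte. */
    static uint64_t strip_mask(bool may_read, bool may_write, bool may_exec,
                               uint64_t l1_rc, bool writing)
    {
            uint64_t perm = 0;

            perm |= may_read  ? 0 : SK_READ;
            perm |= may_write ? 0 : SK_WRITE;
            perm |= may_exec  ? 0 : SK_EXEC;
            perm |= (l1_rc & SK_ACCESSED) ? 0 : SK_ACCESSED;
            perm |= ((l1_rc & SK_DIRTY) && writing) ? 0 : SK_DIRTY;
            return perm;
    }

    int main(void)
    {
            /* rw, no-exec mapping; read fault; L1 pte has ACCESSED only:
             * EXEC and DIRTY get stripped (prints 0x14 with these values) */
            printf("%#llx\n", (unsigned long long)
                   strip_mask(true, true, false, SK_ACCESSED, false));
            return 0;
    }
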
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index a67cf1cdeda4..3b3791ed74a6 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -107,7 +107,7 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
107EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); 107EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
108 108
109/* Update the dirty bitmap of a memslot */ 109/* Update the dirty bitmap of a memslot */
110void kvmppc_update_dirty_map(struct kvm_memory_slot *memslot, 110void kvmppc_update_dirty_map(const struct kvm_memory_slot *memslot,
111 unsigned long gfn, unsigned long psize) 111 unsigned long gfn, unsigned long psize)
112{ 112{
113 unsigned long npages; 113 unsigned long npages;
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 4efd65d9e828..811a3c2fb0e9 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -587,6 +587,7 @@ void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr)
587 case PVR_POWER8: 587 case PVR_POWER8:
588 case PVR_POWER8E: 588 case PVR_POWER8E:
589 case PVR_POWER8NVL: 589 case PVR_POWER8NVL:
590 case PVR_POWER9:
590 vcpu->arch.hflags |= BOOK3S_HFLAG_MULTI_PGSIZE | 591 vcpu->arch.hflags |= BOOK3S_HFLAG_MULTI_PGSIZE |
591 BOOK3S_HFLAG_NEW_TLBIE; 592 BOOK3S_HFLAG_NEW_TLBIE;
592 break; 593 break;
@@ -1913,7 +1914,8 @@ static int kvmppc_core_prepare_memory_region_pr(struct kvm *kvm,
1913static void kvmppc_core_commit_memory_region_pr(struct kvm *kvm, 1914static void kvmppc_core_commit_memory_region_pr(struct kvm *kvm,
1914 const struct kvm_userspace_memory_region *mem, 1915 const struct kvm_userspace_memory_region *mem,
1915 const struct kvm_memory_slot *old, 1916 const struct kvm_memory_slot *old,
1916 const struct kvm_memory_slot *new) 1917 const struct kvm_memory_slot *new,
1918 enum kvm_mr_change change)
1917{ 1919{
1918 return; 1920 return;
1919} 1921}
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index b0b2bfc2ff51..f27ee57ab46e 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -1015,17 +1015,7 @@ static int xics_debug_show(struct seq_file *m, void *private)
	return 0;
 }
 
-static int xics_debug_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, xics_debug_show, inode->i_private);
-}
-
-static const struct file_operations xics_debug_fops = {
-	.open = xics_debug_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(xics_debug);
 
 static void xics_debugfs_init(struct kvmppc_xics *xics)
 {
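
DEFINE_SHOW_ATTRIBUTE(xics_debug) is expected to generate essentially the boilerplate deleted above: an xics_debug_open() that calls single_open() with xics_debug_show(), and an xics_debug_fops wired to seq_read, seq_lseek and single_release. A compilable user-space stand-in for the same token-pasting pattern (the macro and types here are sketches, not the seq_file API):

    #include <stdio.h>

    struct fops_sketch {
            int (*show)(void);
    };

    /* Derives name##_show_wrapper() and name##_fops from one show function,
     * the way DEFINE_SHOW_ATTRIBUTE() derives the open helper and fops. */
    #define DEFINE_SHOW_SKETCH(name)                                        \
    static int name##_show_wrapper(void) { return name##_show(); }         \
    static const struct fops_sketch name##_fops = {                        \
            .show = name##_show_wrapper,                                    \
    }

    static int xics_debug_show(void)
    {
            puts("xics state would be dumped here");
            return 0;
    }

    DEFINE_SHOW_SKETCH(xics_debug);

    int main(void)
    {
            return xics_debug_fops.show();
    }
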
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index ad4a370703d3..f78d002f0fe0 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -1968,17 +1968,7 @@ static int xive_debug_show(struct seq_file *m, void *private)
	return 0;
 }
 
-static int xive_debug_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, xive_debug_show, inode->i_private);
-}
-
-static const struct file_operations xive_debug_fops = {
-	.open = xive_debug_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(xive_debug);
 
 static void xive_debugfs_init(struct kvmppc_xive *xive)
 {
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index a9ca016da670..dbec4128bb51 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -1833,7 +1833,8 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
1833void kvmppc_core_commit_memory_region(struct kvm *kvm, 1833void kvmppc_core_commit_memory_region(struct kvm *kvm,
1834 const struct kvm_userspace_memory_region *mem, 1834 const struct kvm_userspace_memory_region *mem,
1835 const struct kvm_memory_slot *old, 1835 const struct kvm_memory_slot *old,
1836 const struct kvm_memory_slot *new) 1836 const struct kvm_memory_slot *new,
1837 enum kvm_mr_change change)
1837{ 1838{
1838} 1839}
1839 1840
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index 8f2985e46f6f..c3f312b2bcb3 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -757,10 +757,11 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
757 return 0; 757 return 0;
758} 758}
759 759
760void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) 760int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
761{ 761{
762 /* The page will get remapped properly on its next fault */ 762 /* The page will get remapped properly on its next fault */
763 kvm_unmap_hva(kvm, hva); 763 kvm_unmap_hva(kvm, hva);
764 return 0;
764} 765}
765 766
766/*****************************************/ 767/*****************************************/
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 2869a299c4ed..b90a7d154180 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -331,10 +331,17 @@ int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr,
331{ 331{
332 ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM & PAGE_MASK; 332 ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM & PAGE_MASK;
333 struct kvmppc_pte pte; 333 struct kvmppc_pte pte;
334 int r; 334 int r = -EINVAL;
335 335
336 vcpu->stat.st++; 336 vcpu->stat.st++;
337 337
338 if (vcpu->kvm->arch.kvm_ops && vcpu->kvm->arch.kvm_ops->store_to_eaddr)
339 r = vcpu->kvm->arch.kvm_ops->store_to_eaddr(vcpu, eaddr, ptr,
340 size);
341
342 if ((!r) || (r == -EAGAIN))
343 return r;
344
338 r = kvmppc_xlate(vcpu, *eaddr, data ? XLATE_DATA : XLATE_INST, 345 r = kvmppc_xlate(vcpu, *eaddr, data ? XLATE_DATA : XLATE_INST,
339 XLATE_WRITE, &pte); 346 XLATE_WRITE, &pte);
340 if (r < 0) 347 if (r < 0)
@@ -367,10 +374,17 @@ int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr,
367{ 374{
368 ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM & PAGE_MASK; 375 ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM & PAGE_MASK;
369 struct kvmppc_pte pte; 376 struct kvmppc_pte pte;
370 int rc; 377 int rc = -EINVAL;
371 378
372 vcpu->stat.ld++; 379 vcpu->stat.ld++;
373 380
381 if (vcpu->kvm->arch.kvm_ops && vcpu->kvm->arch.kvm_ops->load_from_eaddr)
382 rc = vcpu->kvm->arch.kvm_ops->load_from_eaddr(vcpu, eaddr, ptr,
383 size);
384
385 if ((!rc) || (rc == -EAGAIN))
386 return rc;
387
374 rc = kvmppc_xlate(vcpu, *eaddr, data ? XLATE_DATA : XLATE_INST, 388 rc = kvmppc_xlate(vcpu, *eaddr, data ? XLATE_DATA : XLATE_INST,
375 XLATE_READ, &pte); 389 XLATE_READ, &pte);
376 if (rc) 390 if (rc)
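
kvmppc_ld() here (and kvmppc_st() in the previous hunk) now try the per-VM load_from_eaddr/store_to_eaddr hook first and only fall back to the translate-and-copy path when the hook neither succeeded nor returned -EAGAIN. A small user-space sketch of that dispatch order (plain function pointers stand in for the kvmppc_ops hook and the legacy path):

    #include <errno.h>
    #include <stdio.h>

    static int dispatch(int (*hook)(void), int (*xlate_path)(void))
    {
            int rc = -EINVAL;

            if (hook)
                    rc = hook();
            if (rc == 0 || rc == -EAGAIN)   /* done, or must not fall back */
                    return rc;

            return xlate_path();
    }

    static int hook_ok(void)    { return 0; }
    static int hook_again(void) { return -EAGAIN; }
    static int old_path(void)   { return 1234; }

    int main(void)
    {
            printf("%d\n", dispatch(hook_ok, old_path));    /* 0       */
            printf("%d\n", dispatch(hook_again, old_path)); /* -EAGAIN */
            printf("%d\n", dispatch(NULL, old_path));       /* 1234    */
            return 0;
    }
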
@@ -518,7 +532,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
518 case KVM_CAP_PPC_UNSET_IRQ: 532 case KVM_CAP_PPC_UNSET_IRQ:
519 case KVM_CAP_PPC_IRQ_LEVEL: 533 case KVM_CAP_PPC_IRQ_LEVEL:
520 case KVM_CAP_ENABLE_CAP: 534 case KVM_CAP_ENABLE_CAP:
521 case KVM_CAP_ENABLE_CAP_VM:
522 case KVM_CAP_ONE_REG: 535 case KVM_CAP_ONE_REG:
523 case KVM_CAP_IOEVENTFD: 536 case KVM_CAP_IOEVENTFD:
524 case KVM_CAP_DEVICE_CTRL: 537 case KVM_CAP_DEVICE_CTRL:
@@ -543,8 +556,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
543#ifdef CONFIG_PPC_BOOK3S_64 556#ifdef CONFIG_PPC_BOOK3S_64
544 case KVM_CAP_SPAPR_TCE: 557 case KVM_CAP_SPAPR_TCE:
545 case KVM_CAP_SPAPR_TCE_64: 558 case KVM_CAP_SPAPR_TCE_64:
546 /* fallthrough */ 559 r = 1;
560 break;
547 case KVM_CAP_SPAPR_TCE_VFIO: 561 case KVM_CAP_SPAPR_TCE_VFIO:
562 r = !!cpu_has_feature(CPU_FTR_HVMODE);
563 break;
548 case KVM_CAP_PPC_RTAS: 564 case KVM_CAP_PPC_RTAS:
549 case KVM_CAP_PPC_FIXUP_HCALL: 565 case KVM_CAP_PPC_FIXUP_HCALL:
550 case KVM_CAP_PPC_ENABLE_HCALL: 566 case KVM_CAP_PPC_ENABLE_HCALL:
@@ -696,7 +712,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
696 const struct kvm_memory_slot *new, 712 const struct kvm_memory_slot *new,
697 enum kvm_mr_change change) 713 enum kvm_mr_change change)
698{ 714{
699 kvmppc_core_commit_memory_region(kvm, mem, old, new); 715 kvmppc_core_commit_memory_region(kvm, mem, old, new, change);
700} 716}
701 717
702void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 718void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
@@ -1192,6 +1208,14 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
1192 kvmppc_set_vmx_byte(vcpu, gpr); 1208 kvmppc_set_vmx_byte(vcpu, gpr);
1193 break; 1209 break;
1194#endif 1210#endif
1211#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
1212 case KVM_MMIO_REG_NESTED_GPR:
1213 if (kvmppc_need_byteswap(vcpu))
1214 gpr = swab64(gpr);
1215 kvm_vcpu_write_guest(vcpu, vcpu->arch.nested_io_gpr, &gpr,
1216 sizeof(gpr));
1217 break;
1218#endif
1195 default: 1219 default:
1196 BUG(); 1220 BUG();
1197 } 1221 }
@@ -2084,8 +2108,8 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
2084} 2108}
2085 2109
2086 2110
2087static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, 2111int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
2088 struct kvm_enable_cap *cap) 2112 struct kvm_enable_cap *cap)
2089{ 2113{
2090 int r; 2114 int r;
2091 2115
@@ -2273,15 +2297,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
2273 2297
2274 break; 2298 break;
2275 } 2299 }
2276 case KVM_ENABLE_CAP:
2277 {
2278 struct kvm_enable_cap cap;
2279 r = -EFAULT;
2280 if (copy_from_user(&cap, argp, sizeof(cap)))
2281 goto out;
2282 r = kvm_vm_ioctl_enable_cap(kvm, &cap);
2283 break;
2284 }
2285#ifdef CONFIG_SPAPR_TCE_IOMMU 2300#ifdef CONFIG_SPAPR_TCE_IOMMU
2286 case KVM_CREATE_SPAPR_TCE_64: { 2301 case KVM_CREATE_SPAPR_TCE_64: {
2287 struct kvm_create_spapr_tce_64 create_tce_64; 2302 struct kvm_create_spapr_tce_64 create_tce_64;
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 1697e903bbf2..2e6fb1d758c3 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -636,6 +636,7 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
	switch (TRAP(regs)) {
	case 0x300:
	case 0x380:
+	case 0xe00:
		printk(KERN_ALERT "Unable to handle kernel paging request for "
			"data at address 0x%08lx\n", regs->dar);
		break;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index fe24150ff666..7f4bc58a53b9 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -11,6 +11,9 @@
11 * Jason J. Herne <jjherne@us.ibm.com> 11 * Jason J. Herne <jjherne@us.ibm.com>
12 */ 12 */
13 13
14#define KMSG_COMPONENT "kvm-s390"
15#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
16
14#include <linux/compiler.h> 17#include <linux/compiler.h>
15#include <linux/err.h> 18#include <linux/err.h>
16#include <linux/fs.h> 19#include <linux/fs.h>
@@ -44,10 +47,6 @@
44#include "kvm-s390.h" 47#include "kvm-s390.h"
45#include "gaccess.h" 48#include "gaccess.h"
46 49
47#define KMSG_COMPONENT "kvm-s390"
48#undef pr_fmt
49#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
50
51#define CREATE_TRACE_POINTS 50#define CREATE_TRACE_POINTS
52#include "trace.h" 51#include "trace.h"
53#include "trace-s390.h" 52#include "trace-s390.h"
@@ -417,19 +416,30 @@ static void kvm_s390_cpu_feat_init(void)
417 416
418int kvm_arch_init(void *opaque) 417int kvm_arch_init(void *opaque)
419{ 418{
419 int rc;
420
420 kvm_s390_dbf = debug_register("kvm-trace", 32, 1, 7 * sizeof(long)); 421 kvm_s390_dbf = debug_register("kvm-trace", 32, 1, 7 * sizeof(long));
421 if (!kvm_s390_dbf) 422 if (!kvm_s390_dbf)
422 return -ENOMEM; 423 return -ENOMEM;
423 424
424 if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view)) { 425 if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view)) {
425 debug_unregister(kvm_s390_dbf); 426 rc = -ENOMEM;
426 return -ENOMEM; 427 goto out_debug_unreg;
427 } 428 }
428 429
429 kvm_s390_cpu_feat_init(); 430 kvm_s390_cpu_feat_init();
430 431
431 /* Register floating interrupt controller interface. */ 432 /* Register floating interrupt controller interface. */
432 return kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC); 433 rc = kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC);
434 if (rc) {
435 pr_err("Failed to register FLIC rc=%d\n", rc);
436 goto out_debug_unreg;
437 }
438 return 0;
439
440out_debug_unreg:
441 debug_unregister(kvm_s390_dbf);
442 return rc;
433} 443}
434 444
435void kvm_arch_exit(void) 445void kvm_arch_exit(void)
@@ -464,7 +474,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
464 case KVM_CAP_S390_CSS_SUPPORT: 474 case KVM_CAP_S390_CSS_SUPPORT:
465 case KVM_CAP_IOEVENTFD: 475 case KVM_CAP_IOEVENTFD:
466 case KVM_CAP_DEVICE_CTRL: 476 case KVM_CAP_DEVICE_CTRL:
467 case KVM_CAP_ENABLE_CAP_VM:
468 case KVM_CAP_S390_IRQCHIP: 477 case KVM_CAP_S390_IRQCHIP:
469 case KVM_CAP_VM_ATTRIBUTES: 478 case KVM_CAP_VM_ATTRIBUTES:
470 case KVM_CAP_MP_STATE: 479 case KVM_CAP_MP_STATE:
@@ -607,7 +616,7 @@ static void icpt_operexc_on_all_vcpus(struct kvm *kvm)
607 } 616 }
608} 617}
609 618
610static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) 619int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
611{ 620{
612 int r; 621 int r;
613 622
@@ -1933,14 +1942,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
1933 r = kvm_s390_inject_vm(kvm, &s390int); 1942 r = kvm_s390_inject_vm(kvm, &s390int);
1934 break; 1943 break;
1935 } 1944 }
1936 case KVM_ENABLE_CAP: {
1937 struct kvm_enable_cap cap;
1938 r = -EFAULT;
1939 if (copy_from_user(&cap, argp, sizeof(cap)))
1940 break;
1941 r = kvm_vm_ioctl_enable_cap(kvm, &cap);
1942 break;
1943 }
1944 case KVM_CREATE_IRQCHIP: { 1945 case KVM_CREATE_IRQCHIP: {
1945 struct kvm_irq_routing_entry routing; 1946 struct kvm_irq_routing_entry routing;
1946 1947
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 3a0aa83cbd07..9494ca68fd9d 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -68,6 +68,7 @@ static struct pt_cap_desc {
68 PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)), 68 PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)),
69 PT_CAP(topa_multiple_entries, 0, CPUID_ECX, BIT(1)), 69 PT_CAP(topa_multiple_entries, 0, CPUID_ECX, BIT(1)),
70 PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)), 70 PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)),
71 PT_CAP(output_subsys, 0, CPUID_ECX, BIT(3)),
71 PT_CAP(payloads_lip, 0, CPUID_ECX, BIT(31)), 72 PT_CAP(payloads_lip, 0, CPUID_ECX, BIT(31)),
72 PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x3), 73 PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x3),
73 PT_CAP(mtc_periods, 1, CPUID_EAX, 0xffff0000), 74 PT_CAP(mtc_periods, 1, CPUID_EAX, 0xffff0000),
@@ -75,14 +76,21 @@ static struct pt_cap_desc {
75 PT_CAP(psb_periods, 1, CPUID_EBX, 0xffff0000), 76 PT_CAP(psb_periods, 1, CPUID_EBX, 0xffff0000),
76}; 77};
77 78
78static u32 pt_cap_get(enum pt_capabilities cap) 79u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability)
79{ 80{
80 struct pt_cap_desc *cd = &pt_caps[cap]; 81 struct pt_cap_desc *cd = &pt_caps[capability];
81 u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; 82 u32 c = caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
82 unsigned int shift = __ffs(cd->mask); 83 unsigned int shift = __ffs(cd->mask);
83 84
84 return (c & cd->mask) >> shift; 85 return (c & cd->mask) >> shift;
85} 86}
87EXPORT_SYMBOL_GPL(intel_pt_validate_cap);
88
89u32 intel_pt_validate_hw_cap(enum pt_capabilities cap)
90{
91 return intel_pt_validate_cap(pt_pmu.caps, cap);
92}
93EXPORT_SYMBOL_GPL(intel_pt_validate_hw_cap);
86 94
87static ssize_t pt_cap_show(struct device *cdev, 95static ssize_t pt_cap_show(struct device *cdev,
88 struct device_attribute *attr, 96 struct device_attribute *attr,
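
intel_pt_validate_cap() above is the old pt_cap_get() lookup made reusable against any caps array: index the flat array by (CPUID leaf, register), mask out the field and shift it down to bit 0. A user-space sketch of that extraction, with __builtin_ctz() standing in for the kernel's __ffs() (the sample capability and values are made up):

    #include <stdint.h>
    #include <stdio.h>

    #define PT_CPUID_REGS_NUM 4     /* EAX..EDX per CPUID leaf */

    static uint32_t pt_cap_extract(const uint32_t *caps, unsigned int leaf,
                                   unsigned int reg, uint32_t mask)
    {
            uint32_t c = caps[leaf * PT_CPUID_REGS_NUM + reg];

            return (c & mask) >> __builtin_ctz(mask);
    }

    int main(void)
    {
            uint32_t caps[2 * PT_CPUID_REGS_NUM] = { 0 };

            /* pretend leaf 1, EAX reports 2 address ranges (mask 0x3) */
            caps[1 * PT_CPUID_REGS_NUM + 0] = 0x2;
            printf("%u\n", pt_cap_extract(caps, 1, 0, 0x3));
            return 0;
    }
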
@@ -92,7 +100,7 @@ static ssize_t pt_cap_show(struct device *cdev,
92 container_of(attr, struct dev_ext_attribute, attr); 100 container_of(attr, struct dev_ext_attribute, attr);
93 enum pt_capabilities cap = (long)ea->var; 101 enum pt_capabilities cap = (long)ea->var;
94 102
95 return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap)); 103 return snprintf(buf, PAGE_SIZE, "%x\n", intel_pt_validate_hw_cap(cap));
96} 104}
97 105
98static struct attribute_group pt_cap_group __ro_after_init = { 106static struct attribute_group pt_cap_group __ro_after_init = {
@@ -310,16 +318,16 @@ static bool pt_event_valid(struct perf_event *event)
310 return false; 318 return false;
311 319
312 if (config & RTIT_CTL_CYC_PSB) { 320 if (config & RTIT_CTL_CYC_PSB) {
313 if (!pt_cap_get(PT_CAP_psb_cyc)) 321 if (!intel_pt_validate_hw_cap(PT_CAP_psb_cyc))
314 return false; 322 return false;
315 323
316 allowed = pt_cap_get(PT_CAP_psb_periods); 324 allowed = intel_pt_validate_hw_cap(PT_CAP_psb_periods);
317 requested = (config & RTIT_CTL_PSB_FREQ) >> 325 requested = (config & RTIT_CTL_PSB_FREQ) >>
318 RTIT_CTL_PSB_FREQ_OFFSET; 326 RTIT_CTL_PSB_FREQ_OFFSET;
319 if (requested && (!(allowed & BIT(requested)))) 327 if (requested && (!(allowed & BIT(requested))))
320 return false; 328 return false;
321 329
322 allowed = pt_cap_get(PT_CAP_cycle_thresholds); 330 allowed = intel_pt_validate_hw_cap(PT_CAP_cycle_thresholds);
323 requested = (config & RTIT_CTL_CYC_THRESH) >> 331 requested = (config & RTIT_CTL_CYC_THRESH) >>
324 RTIT_CTL_CYC_THRESH_OFFSET; 332 RTIT_CTL_CYC_THRESH_OFFSET;
325 if (requested && (!(allowed & BIT(requested)))) 333 if (requested && (!(allowed & BIT(requested))))
@@ -334,10 +342,10 @@ static bool pt_event_valid(struct perf_event *event)
334 * Spec says that setting mtc period bits while mtc bit in 342 * Spec says that setting mtc period bits while mtc bit in
335 * CPUID is 0 will #GP, so better safe than sorry. 343 * CPUID is 0 will #GP, so better safe than sorry.
336 */ 344 */
337 if (!pt_cap_get(PT_CAP_mtc)) 345 if (!intel_pt_validate_hw_cap(PT_CAP_mtc))
338 return false; 346 return false;
339 347
340 allowed = pt_cap_get(PT_CAP_mtc_periods); 348 allowed = intel_pt_validate_hw_cap(PT_CAP_mtc_periods);
341 if (!allowed) 349 if (!allowed)
342 return false; 350 return false;
343 351
@@ -349,11 +357,11 @@ static bool pt_event_valid(struct perf_event *event)
349 } 357 }
350 358
351 if (config & RTIT_CTL_PWR_EVT_EN && 359 if (config & RTIT_CTL_PWR_EVT_EN &&
352 !pt_cap_get(PT_CAP_power_event_trace)) 360 !intel_pt_validate_hw_cap(PT_CAP_power_event_trace))
353 return false; 361 return false;
354 362
355 if (config & RTIT_CTL_PTW) { 363 if (config & RTIT_CTL_PTW) {
356 if (!pt_cap_get(PT_CAP_ptwrite)) 364 if (!intel_pt_validate_hw_cap(PT_CAP_ptwrite))
357 return false; 365 return false;
358 366
359 /* FUPonPTW without PTW doesn't make sense */ 367 /* FUPonPTW without PTW doesn't make sense */
@@ -598,7 +606,7 @@ static struct topa *topa_alloc(int cpu, gfp_t gfp)
 598	 * In case of single-entry ToPA, always put the self-referencing END	606	 * In case of single-entry ToPA, always put the self-referencing END
599 * link as the 2nd entry in the table 607 * link as the 2nd entry in the table
600 */ 608 */
601 if (!pt_cap_get(PT_CAP_topa_multiple_entries)) { 609 if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) {
602 TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT; 610 TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT;
603 TOPA_ENTRY(topa, 1)->end = 1; 611 TOPA_ENTRY(topa, 1)->end = 1;
604 } 612 }
@@ -638,7 +646,7 @@ static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
638 topa->offset = last->offset + last->size; 646 topa->offset = last->offset + last->size;
639 buf->last = topa; 647 buf->last = topa;
640 648
641 if (!pt_cap_get(PT_CAP_topa_multiple_entries)) 649 if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
642 return; 650 return;
643 651
644 BUG_ON(last->last != TENTS_PER_PAGE - 1); 652 BUG_ON(last->last != TENTS_PER_PAGE - 1);
@@ -654,7 +662,7 @@ static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
654static bool topa_table_full(struct topa *topa) 662static bool topa_table_full(struct topa *topa)
655{ 663{
656 /* single-entry ToPA is a special case */ 664 /* single-entry ToPA is a special case */
657 if (!pt_cap_get(PT_CAP_topa_multiple_entries)) 665 if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
658 return !!topa->last; 666 return !!topa->last;
659 667
660 return topa->last == TENTS_PER_PAGE - 1; 668 return topa->last == TENTS_PER_PAGE - 1;
@@ -690,7 +698,8 @@ static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp)
690 698
691 TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT; 699 TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
692 TOPA_ENTRY(topa, -1)->size = order; 700 TOPA_ENTRY(topa, -1)->size = order;
693 if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) { 701 if (!buf->snapshot &&
702 !intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) {
694 TOPA_ENTRY(topa, -1)->intr = 1; 703 TOPA_ENTRY(topa, -1)->intr = 1;
695 TOPA_ENTRY(topa, -1)->stop = 1; 704 TOPA_ENTRY(topa, -1)->stop = 1;
696 } 705 }
@@ -725,7 +734,7 @@ static void pt_topa_dump(struct pt_buffer *buf)
725 topa->table[i].intr ? 'I' : ' ', 734 topa->table[i].intr ? 'I' : ' ',
726 topa->table[i].stop ? 'S' : ' ', 735 topa->table[i].stop ? 'S' : ' ',
727 *(u64 *)&topa->table[i]); 736 *(u64 *)&topa->table[i]);
728 if ((pt_cap_get(PT_CAP_topa_multiple_entries) && 737 if ((intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) &&
729 topa->table[i].stop) || 738 topa->table[i].stop) ||
730 topa->table[i].end) 739 topa->table[i].end)
731 break; 740 break;
@@ -828,7 +837,7 @@ static void pt_handle_status(struct pt *pt)
828 * means we are already losing data; need to let the decoder 837 * means we are already losing data; need to let the decoder
829 * know. 838 * know.
830 */ 839 */
831 if (!pt_cap_get(PT_CAP_topa_multiple_entries) || 840 if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) ||
832 buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) { 841 buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
833 perf_aux_output_flag(&pt->handle, 842 perf_aux_output_flag(&pt->handle,
834 PERF_AUX_FLAG_TRUNCATED); 843 PERF_AUX_FLAG_TRUNCATED);
@@ -840,7 +849,8 @@ static void pt_handle_status(struct pt *pt)
840 * Also on single-entry ToPA implementations, interrupt will come 849 * Also on single-entry ToPA implementations, interrupt will come
841 * before the output reaches its output region's boundary. 850 * before the output reaches its output region's boundary.
842 */ 851 */
843 if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot && 852 if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) &&
853 !buf->snapshot &&
844 pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) { 854 pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
845 void *head = pt_buffer_region(buf); 855 void *head = pt_buffer_region(buf);
846 856
@@ -931,7 +941,7 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf,
931 941
932 942
933 /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */ 943 /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
934 if (!pt_cap_get(PT_CAP_topa_multiple_entries)) 944 if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
935 return 0; 945 return 0;
936 946
937 /* clear STOP and INT from current entry */ 947 /* clear STOP and INT from current entry */
@@ -1082,7 +1092,7 @@ static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages,
1082 pt_buffer_setup_topa_index(buf); 1092 pt_buffer_setup_topa_index(buf);
1083 1093
1084 /* link last table to the first one, unless we're double buffering */ 1094 /* link last table to the first one, unless we're double buffering */
1085 if (pt_cap_get(PT_CAP_topa_multiple_entries)) { 1095 if (intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) {
1086 TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT; 1096 TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT;
1087 TOPA_ENTRY(buf->last, -1)->end = 1; 1097 TOPA_ENTRY(buf->last, -1)->end = 1;
1088 } 1098 }
@@ -1153,7 +1163,7 @@ static int pt_addr_filters_init(struct perf_event *event)
1153 struct pt_filters *filters; 1163 struct pt_filters *filters;
1154 int node = event->cpu == -1 ? -1 : cpu_to_node(event->cpu); 1164 int node = event->cpu == -1 ? -1 : cpu_to_node(event->cpu);
1155 1165
1156 if (!pt_cap_get(PT_CAP_num_address_ranges)) 1166 if (!intel_pt_validate_hw_cap(PT_CAP_num_address_ranges))
1157 return 0; 1167 return 0;
1158 1168
1159 filters = kzalloc_node(sizeof(struct pt_filters), GFP_KERNEL, node); 1169 filters = kzalloc_node(sizeof(struct pt_filters), GFP_KERNEL, node);
@@ -1202,7 +1212,7 @@ static int pt_event_addr_filters_validate(struct list_head *filters)
1202 return -EINVAL; 1212 return -EINVAL;
1203 } 1213 }
1204 1214
1205 if (++range > pt_cap_get(PT_CAP_num_address_ranges)) 1215 if (++range > intel_pt_validate_hw_cap(PT_CAP_num_address_ranges))
1206 return -EOPNOTSUPP; 1216 return -EOPNOTSUPP;
1207 } 1217 }
1208 1218
@@ -1507,12 +1517,12 @@ static __init int pt_init(void)
1507 if (ret) 1517 if (ret)
1508 return ret; 1518 return ret;
1509 1519
1510 if (!pt_cap_get(PT_CAP_topa_output)) { 1520 if (!intel_pt_validate_hw_cap(PT_CAP_topa_output)) {
1511 pr_warn("ToPA output is not supported on this CPU\n"); 1521 pr_warn("ToPA output is not supported on this CPU\n");
1512 return -ENODEV; 1522 return -ENODEV;
1513 } 1523 }
1514 1524
1515 if (!pt_cap_get(PT_CAP_topa_multiple_entries)) 1525 if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
1516 pt_pmu.pmu.capabilities = 1526 pt_pmu.pmu.capabilities =
1517 PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF; 1527 PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF;
1518 1528
@@ -1530,7 +1540,7 @@ static __init int pt_init(void)
1530 pt_pmu.pmu.addr_filters_sync = pt_event_addr_filters_sync; 1540 pt_pmu.pmu.addr_filters_sync = pt_event_addr_filters_sync;
1531 pt_pmu.pmu.addr_filters_validate = pt_event_addr_filters_validate; 1541 pt_pmu.pmu.addr_filters_validate = pt_event_addr_filters_validate;
1532 pt_pmu.pmu.nr_addr_filters = 1542 pt_pmu.pmu.nr_addr_filters =
1533 pt_cap_get(PT_CAP_num_address_ranges); 1543 intel_pt_validate_hw_cap(PT_CAP_num_address_ranges);
1534 1544
1535 ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1); 1545 ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
1536 1546
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index 0eb41d07b79a..269e15a9086c 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -20,43 +20,6 @@
20#define __INTEL_PT_H__ 20#define __INTEL_PT_H__
21 21
22/* 22/*
23 * PT MSR bit definitions
24 */
25#define RTIT_CTL_TRACEEN BIT(0)
26#define RTIT_CTL_CYCLEACC BIT(1)
27#define RTIT_CTL_OS BIT(2)
28#define RTIT_CTL_USR BIT(3)
29#define RTIT_CTL_PWR_EVT_EN BIT(4)
30#define RTIT_CTL_FUP_ON_PTW BIT(5)
31#define RTIT_CTL_CR3EN BIT(7)
32#define RTIT_CTL_TOPA BIT(8)
33#define RTIT_CTL_MTC_EN BIT(9)
34#define RTIT_CTL_TSC_EN BIT(10)
35#define RTIT_CTL_DISRETC BIT(11)
36#define RTIT_CTL_PTW_EN BIT(12)
37#define RTIT_CTL_BRANCH_EN BIT(13)
38#define RTIT_CTL_MTC_RANGE_OFFSET 14
39#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET)
40#define RTIT_CTL_CYC_THRESH_OFFSET 19
41#define RTIT_CTL_CYC_THRESH (0x0full << RTIT_CTL_CYC_THRESH_OFFSET)
42#define RTIT_CTL_PSB_FREQ_OFFSET 24
43#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET)
44#define RTIT_CTL_ADDR0_OFFSET 32
45#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET)
46#define RTIT_CTL_ADDR1_OFFSET 36
47#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET)
48#define RTIT_CTL_ADDR2_OFFSET 40
49#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET)
50#define RTIT_CTL_ADDR3_OFFSET 44
51#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET)
52#define RTIT_STATUS_FILTEREN BIT(0)
53#define RTIT_STATUS_CONTEXTEN BIT(1)
54#define RTIT_STATUS_TRIGGEREN BIT(2)
55#define RTIT_STATUS_BUFFOVF BIT(3)
56#define RTIT_STATUS_ERROR BIT(4)
57#define RTIT_STATUS_STOPPED BIT(5)
58
59/*
60 * Single-entry ToPA: when this close to region boundary, switch 23 * Single-entry ToPA: when this close to region boundary, switch
61 * buffers to avoid losing data. 24 * buffers to avoid losing data.
62 */ 25 */
@@ -82,30 +45,9 @@ struct topa_entry {
82 u64 rsvd4 : 16; 45 u64 rsvd4 : 16;
83}; 46};
84 47
85#define PT_CPUID_LEAVES 2
 86#define PT_CPUID_REGS_NUM	4 /* number of registers (eax, ebx, ecx, edx) */
87
88/* TSC to Core Crystal Clock Ratio */ 48/* TSC to Core Crystal Clock Ratio */
89#define CPUID_TSC_LEAF 0x15 49#define CPUID_TSC_LEAF 0x15
90 50
91enum pt_capabilities {
92 PT_CAP_max_subleaf = 0,
93 PT_CAP_cr3_filtering,
94 PT_CAP_psb_cyc,
95 PT_CAP_ip_filtering,
96 PT_CAP_mtc,
97 PT_CAP_ptwrite,
98 PT_CAP_power_event_trace,
99 PT_CAP_topa_output,
100 PT_CAP_topa_multiple_entries,
101 PT_CAP_single_range_output,
102 PT_CAP_payloads_lip,
103 PT_CAP_num_address_ranges,
104 PT_CAP_mtc_periods,
105 PT_CAP_cycle_thresholds,
106 PT_CAP_psb_periods,
107};
108
109struct pt_pmu { 51struct pt_pmu {
110 struct pmu pmu; 52 struct pmu pmu;
111 u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; 53 u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c
index b8e60cc50461..dd0a843f766d 100644
--- a/arch/x86/hyperv/nested.c
+++ b/arch/x86/hyperv/nested.c
@@ -7,6 +7,7 @@
7 * 7 *
8 * Author : Lan Tianyu <Tianyu.Lan@microsoft.com> 8 * Author : Lan Tianyu <Tianyu.Lan@microsoft.com>
9 */ 9 */
10#define pr_fmt(fmt) "Hyper-V: " fmt
10 11
11 12
12#include <linux/types.h> 13#include <linux/types.h>
@@ -54,3 +55,82 @@ fault:
54 return ret; 55 return ret;
55} 56}
56EXPORT_SYMBOL_GPL(hyperv_flush_guest_mapping); 57EXPORT_SYMBOL_GPL(hyperv_flush_guest_mapping);
58
59int hyperv_fill_flush_guest_mapping_list(
60 struct hv_guest_mapping_flush_list *flush,
61 u64 start_gfn, u64 pages)
62{
63 u64 cur = start_gfn;
64 u64 additional_pages;
65 int gpa_n = 0;
66
67 do {
68 /*
 69		 * If flush requests exceed max flush count, fall back to
 70		 * flushing the TLB without a range.
71 */
72 if (gpa_n >= HV_MAX_FLUSH_REP_COUNT)
73 return -ENOSPC;
74
75 additional_pages = min_t(u64, pages, HV_MAX_FLUSH_PAGES) - 1;
76
77 flush->gpa_list[gpa_n].page.additional_pages = additional_pages;
78 flush->gpa_list[gpa_n].page.largepage = false;
79 flush->gpa_list[gpa_n].page.basepfn = cur;
80
81 pages -= additional_pages + 1;
82 cur += additional_pages + 1;
83 gpa_n++;
84 } while (pages > 0);
85
86 return gpa_n;
87}
88EXPORT_SYMBOL_GPL(hyperv_fill_flush_guest_mapping_list);
89
90int hyperv_flush_guest_mapping_range(u64 as,
91 hyperv_fill_flush_list_func fill_flush_list_func, void *data)
92{
93 struct hv_guest_mapping_flush_list **flush_pcpu;
94 struct hv_guest_mapping_flush_list *flush;
95 u64 status = 0;
96 unsigned long flags;
97 int ret = -ENOTSUPP;
98 int gpa_n = 0;
99
100 if (!hv_hypercall_pg || !fill_flush_list_func)
101 goto fault;
102
103 local_irq_save(flags);
104
105 flush_pcpu = (struct hv_guest_mapping_flush_list **)
106 this_cpu_ptr(hyperv_pcpu_input_arg);
107
108 flush = *flush_pcpu;
109 if (unlikely(!flush)) {
110 local_irq_restore(flags);
111 goto fault;
112 }
113
114 flush->address_space = as;
115 flush->flags = 0;
116
117 gpa_n = fill_flush_list_func(flush, data);
118 if (gpa_n < 0) {
119 local_irq_restore(flags);
120 goto fault;
121 }
122
123 status = hv_do_rep_hypercall(HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST,
124 gpa_n, 0, flush, NULL);
125
126 local_irq_restore(flags);
127
128 if (!(status & HV_HYPERCALL_RESULT_MASK))
129 ret = 0;
130 else
131 ret = status;
132fault:
133 trace_hyperv_nested_flush_guest_mapping_range(as, ret);
134 return ret;
135}
136EXPORT_SYMBOL_GPL(hyperv_flush_guest_mapping_range);
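For reference, a caller of hyperv_flush_guest_mapping_range() is expected to supply a fill callback that converts its own range description into hv_gpa_page_range entries, typically by wrapping hyperv_fill_flush_guest_mapping_list(). The sketch below is illustrative only; the my_* names and the range container are hypothetical and not part of this patch:

#include <asm/mshyperv.h>

/* Hypothetical range descriptor passed through the opaque data pointer. */
struct my_flush_range {
	u64 start_gfn;
	u64 pages;
};

static int my_fill_flush_list(struct hv_guest_mapping_flush_list *flush,
			      void *data)
{
	struct my_flush_range *range = data;

	/* Convert the GFN range into gpa_list[] entries. */
	return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
						    range->pages);
}

static int my_flush_guest_range(u64 address_space, u64 start_gfn, u64 pages)
{
	struct my_flush_range range = {
		.start_gfn = start_gfn,
		.pages = pages,
	};
	int ret;

	ret = hyperv_flush_guest_mapping_range(address_space,
					       my_fill_flush_list, &range);
	/* A caller would typically fall back to a full flush on failure. */
	if (ret)
		ret = hyperv_flush_guest_mapping(address_space);

	return ret;
}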
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index df8e94e2f7be..6d6122524711 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -281,6 +281,7 @@
281#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ 281#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */
282#define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ 282#define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */
283#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ 283#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */
284#define X86_FEATURE_WBNOINVD (13*32+ 9) /* WBNOINVD instruction */
284#define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */ 285#define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */
285#define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */ 286#define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */
286#define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */ 287#define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 4139f7650fe5..705dafc2d11a 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -10,6 +10,7 @@
10#define _ASM_X86_HYPERV_TLFS_H 10#define _ASM_X86_HYPERV_TLFS_H
11 11
12#include <linux/types.h> 12#include <linux/types.h>
13#include <asm/page.h>
13 14
14/* 15/*
15 * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent 16 * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent
@@ -30,158 +31,150 @@
30/* 31/*
31 * Feature identification. EAX indicates which features are available 32 * Feature identification. EAX indicates which features are available
32 * to the partition based upon the current partition privileges. 33 * to the partition based upon the current partition privileges.
34 * These are HYPERV_CPUID_FEATURES.EAX bits.
33 */ 35 */
34 36
35/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */ 37/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */
36#define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0) 38#define HV_X64_MSR_VP_RUNTIME_AVAILABLE BIT(0)
37/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ 39/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/
38#define HV_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1) 40#define HV_MSR_TIME_REF_COUNT_AVAILABLE BIT(1)
39/* Partition reference TSC MSR is available */
40#define HV_MSR_REFERENCE_TSC_AVAILABLE (1 << 9)
41/* Partition Guest IDLE MSR is available */
42#define HV_X64_MSR_GUEST_IDLE_AVAILABLE (1 << 10)
43
44/* A partition's reference time stamp counter (TSC) page */
45#define HV_X64_MSR_REFERENCE_TSC 0x40000021
46
47/*
48 * There is a single feature flag that signifies if the partition has access
49 * to MSRs with local APIC and TSC frequencies.
50 */
51#define HV_X64_ACCESS_FREQUENCY_MSRS (1 << 11)
52
53/* AccessReenlightenmentControls privilege */
54#define HV_X64_ACCESS_REENLIGHTENMENT BIT(13)
55
56/* 41/*
57 * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM 42 * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM
58 * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available 43 * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available
59 */ 44 */
60#define HV_X64_MSR_SYNIC_AVAILABLE (1 << 2) 45#define HV_X64_MSR_SYNIC_AVAILABLE BIT(2)
61/* 46/*
62 * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through 47 * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through
63 * HV_X64_MSR_STIMER3_COUNT) available 48 * HV_X64_MSR_STIMER3_COUNT) available
64 */ 49 */
65#define HV_MSR_SYNTIMER_AVAILABLE (1 << 3) 50#define HV_MSR_SYNTIMER_AVAILABLE BIT(3)
66/* 51/*
67 * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR) 52 * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR)
68 * are available 53 * are available
69 */ 54 */
70#define HV_X64_MSR_APIC_ACCESS_AVAILABLE (1 << 4) 55#define HV_X64_MSR_APIC_ACCESS_AVAILABLE BIT(4)
71/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/ 56/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/
72#define HV_X64_MSR_HYPERCALL_AVAILABLE (1 << 5) 57#define HV_X64_MSR_HYPERCALL_AVAILABLE BIT(5)
73/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/ 58/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/
74#define HV_X64_MSR_VP_INDEX_AVAILABLE (1 << 6) 59#define HV_X64_MSR_VP_INDEX_AVAILABLE BIT(6)
75/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/ 60/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/
76#define HV_X64_MSR_RESET_AVAILABLE (1 << 7) 61#define HV_X64_MSR_RESET_AVAILABLE BIT(7)
77 /* 62/*
78 * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE, 63 * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE,
79 * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE, 64 * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE,
80 * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available 65 * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available
81 */ 66 */
82#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8) 67#define HV_X64_MSR_STAT_PAGES_AVAILABLE BIT(8)
83 68/* Partition reference TSC MSR is available */
84/* Frequency MSRs available */ 69#define HV_MSR_REFERENCE_TSC_AVAILABLE BIT(9)
85#define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE (1 << 8) 70/* Partition Guest IDLE MSR is available */
86 71#define HV_X64_MSR_GUEST_IDLE_AVAILABLE BIT(10)
87/* Crash MSR available */ 72/*
88#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE (1 << 10) 73 * There is a single feature flag that signifies if the partition has access
89 74 * to MSRs with local APIC and TSC frequencies.
90/* stimer Direct Mode is available */ 75 */
91#define HV_STIMER_DIRECT_MODE_AVAILABLE (1 << 19) 76#define HV_X64_ACCESS_FREQUENCY_MSRS BIT(11)
77/* AccessReenlightenmentControls privilege */
78#define HV_X64_ACCESS_REENLIGHTENMENT BIT(13)
92 79
93/* 80/*
94 * Feature identification: EBX indicates which flags were specified at 81 * Feature identification: indicates which flags were specified at partition
95 * partition creation. The format is the same as the partition creation 82 * creation. The format is the same as the partition creation flag structure
96 * flag structure defined in section Partition Creation Flags. 83 * defined in section Partition Creation Flags.
84 * These are HYPERV_CPUID_FEATURES.EBX bits.
97 */ 85 */
98#define HV_X64_CREATE_PARTITIONS (1 << 0) 86#define HV_X64_CREATE_PARTITIONS BIT(0)
99#define HV_X64_ACCESS_PARTITION_ID (1 << 1) 87#define HV_X64_ACCESS_PARTITION_ID BIT(1)
100#define HV_X64_ACCESS_MEMORY_POOL (1 << 2) 88#define HV_X64_ACCESS_MEMORY_POOL BIT(2)
101#define HV_X64_ADJUST_MESSAGE_BUFFERS (1 << 3) 89#define HV_X64_ADJUST_MESSAGE_BUFFERS BIT(3)
102#define HV_X64_POST_MESSAGES (1 << 4) 90#define HV_X64_POST_MESSAGES BIT(4)
103#define HV_X64_SIGNAL_EVENTS (1 << 5) 91#define HV_X64_SIGNAL_EVENTS BIT(5)
104#define HV_X64_CREATE_PORT (1 << 6) 92#define HV_X64_CREATE_PORT BIT(6)
105#define HV_X64_CONNECT_PORT (1 << 7) 93#define HV_X64_CONNECT_PORT BIT(7)
106#define HV_X64_ACCESS_STATS (1 << 8) 94#define HV_X64_ACCESS_STATS BIT(8)
107#define HV_X64_DEBUGGING (1 << 11) 95#define HV_X64_DEBUGGING BIT(11)
108#define HV_X64_CPU_POWER_MANAGEMENT (1 << 12) 96#define HV_X64_CPU_POWER_MANAGEMENT BIT(12)
109#define HV_X64_CONFIGURE_PROFILER (1 << 13)
110 97
111/* 98/*
112 * Feature identification. EDX indicates which miscellaneous features 99 * Feature identification. EDX indicates which miscellaneous features
113 * are available to the partition. 100 * are available to the partition.
101 * These are HYPERV_CPUID_FEATURES.EDX bits.
114 */ 102 */
115/* The MWAIT instruction is available (per section MONITOR / MWAIT) */ 103/* The MWAIT instruction is available (per section MONITOR / MWAIT) */
116#define HV_X64_MWAIT_AVAILABLE (1 << 0) 104#define HV_X64_MWAIT_AVAILABLE BIT(0)
117/* Guest debugging support is available */ 105/* Guest debugging support is available */
118#define HV_X64_GUEST_DEBUGGING_AVAILABLE (1 << 1) 106#define HV_X64_GUEST_DEBUGGING_AVAILABLE BIT(1)
119/* Performance Monitor support is available*/ 107/* Performance Monitor support is available*/
120#define HV_X64_PERF_MONITOR_AVAILABLE (1 << 2) 108#define HV_X64_PERF_MONITOR_AVAILABLE BIT(2)
121/* Support for physical CPU dynamic partitioning events is available*/ 109/* Support for physical CPU dynamic partitioning events is available*/
122#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE (1 << 3) 110#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE BIT(3)
123/* 111/*
124 * Support for passing hypercall input parameter block via XMM 112 * Support for passing hypercall input parameter block via XMM
125 * registers is available 113 * registers is available
126 */ 114 */
127#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4) 115#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE BIT(4)
128/* Support for a virtual guest idle state is available */ 116/* Support for a virtual guest idle state is available */
129#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5) 117#define HV_X64_GUEST_IDLE_STATE_AVAILABLE BIT(5)
130/* Guest crash data handler available */ 118/* Frequency MSRs available */
131#define HV_X64_GUEST_CRASH_MSR_AVAILABLE (1 << 10) 119#define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE BIT(8)
120/* Crash MSR available */
121#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE BIT(10)
122/* stimer Direct Mode is available */
123#define HV_STIMER_DIRECT_MODE_AVAILABLE BIT(19)
132 124
133/* 125/*
134 * Implementation recommendations. Indicates which behaviors the hypervisor 126 * Implementation recommendations. Indicates which behaviors the hypervisor
135 * recommends the OS implement for optimal performance. 127 * recommends the OS implement for optimal performance.
128 * These are HYPERV_CPUID_ENLIGHTMENT_INFO.EAX bits.
129 */
130/*
131 * Recommend using hypercall for address space switches rather
132 * than MOV to CR3 instruction
136 */ 133 */
137 /* 134#define HV_X64_AS_SWITCH_RECOMMENDED BIT(0)
138 * Recommend using hypercall for address space switches rather
139 * than MOV to CR3 instruction
140 */
141#define HV_X64_AS_SWITCH_RECOMMENDED (1 << 0)
142/* Recommend using hypercall for local TLB flushes rather 135/* Recommend using hypercall for local TLB flushes rather
143 * than INVLPG or MOV to CR3 instructions */ 136 * than INVLPG or MOV to CR3 instructions */
144#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1) 137#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED BIT(1)
145/* 138/*
146 * Recommend using hypercall for remote TLB flushes rather 139 * Recommend using hypercall for remote TLB flushes rather
147 * than inter-processor interrupts 140 * than inter-processor interrupts
148 */ 141 */
149#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED (1 << 2) 142#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED BIT(2)
150/* 143/*
151 * Recommend using MSRs for accessing APIC registers 144 * Recommend using MSRs for accessing APIC registers
152 * EOI, ICR and TPR rather than their memory-mapped counterparts 145 * EOI, ICR and TPR rather than their memory-mapped counterparts
153 */ 146 */
154#define HV_X64_APIC_ACCESS_RECOMMENDED (1 << 3) 147#define HV_X64_APIC_ACCESS_RECOMMENDED BIT(3)
155/* Recommend using the hypervisor-provided MSR to initiate a system RESET */ 148/* Recommend using the hypervisor-provided MSR to initiate a system RESET */
156#define HV_X64_SYSTEM_RESET_RECOMMENDED (1 << 4) 149#define HV_X64_SYSTEM_RESET_RECOMMENDED BIT(4)
157/* 150/*
158 * Recommend using relaxed timing for this partition. If used, 151 * Recommend using relaxed timing for this partition. If used,
159 * the VM should disable any watchdog timeouts that rely on the 152 * the VM should disable any watchdog timeouts that rely on the
160 * timely delivery of external interrupts 153 * timely delivery of external interrupts
161 */ 154 */
162#define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5) 155#define HV_X64_RELAXED_TIMING_RECOMMENDED BIT(5)
163 156
164/* 157/*
165 * Recommend not using Auto End-Of-Interrupt feature 158 * Recommend not using Auto End-Of-Interrupt feature
166 */ 159 */
167#define HV_DEPRECATING_AEOI_RECOMMENDED (1 << 9) 160#define HV_DEPRECATING_AEOI_RECOMMENDED BIT(9)
168 161
169/* 162/*
170 * Recommend using cluster IPI hypercalls. 163 * Recommend using cluster IPI hypercalls.
171 */ 164 */
172#define HV_X64_CLUSTER_IPI_RECOMMENDED (1 << 10) 165#define HV_X64_CLUSTER_IPI_RECOMMENDED BIT(10)
173 166
174/* Recommend using the newer ExProcessorMasks interface */ 167/* Recommend using the newer ExProcessorMasks interface */
175#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11) 168#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED BIT(11)
176 169
177/* Recommend using enlightened VMCS */ 170/* Recommend using enlightened VMCS */
178#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED (1 << 14) 171#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED BIT(14)
179 172
180/* 173/* Nested features. These are HYPERV_CPUID_NESTED_FEATURES.EAX bits. */
181 * Crash notification flags. 174#define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18)
182 */ 175#define HV_X64_NESTED_MSR_BITMAP BIT(19)
183#define HV_CRASH_CTL_CRASH_NOTIFY_MSG BIT_ULL(62) 176
184#define HV_CRASH_CTL_CRASH_NOTIFY BIT_ULL(63) 177/* Hyper-V specific model specific registers (MSRs) */
185 178
186/* MSR used to identify the guest OS. */ 179/* MSR used to identify the guest OS. */
187#define HV_X64_MSR_GUEST_OS_ID 0x40000000 180#define HV_X64_MSR_GUEST_OS_ID 0x40000000
@@ -201,6 +194,9 @@
201/* MSR used to read the per-partition time reference counter */ 194/* MSR used to read the per-partition time reference counter */
202#define HV_X64_MSR_TIME_REF_COUNT 0x40000020 195#define HV_X64_MSR_TIME_REF_COUNT 0x40000020
203 196
197/* A partition's reference time stamp counter (TSC) page */
198#define HV_X64_MSR_REFERENCE_TSC 0x40000021
199
204/* MSR used to retrieve the TSC frequency */ 200/* MSR used to retrieve the TSC frequency */
205#define HV_X64_MSR_TSC_FREQUENCY 0x40000022 201#define HV_X64_MSR_TSC_FREQUENCY 0x40000022
206 202
@@ -258,9 +254,11 @@
258#define HV_X64_MSR_CRASH_P3 0x40000103 254#define HV_X64_MSR_CRASH_P3 0x40000103
259#define HV_X64_MSR_CRASH_P4 0x40000104 255#define HV_X64_MSR_CRASH_P4 0x40000104
260#define HV_X64_MSR_CRASH_CTL 0x40000105 256#define HV_X64_MSR_CRASH_CTL 0x40000105
261#define HV_X64_MSR_CRASH_CTL_NOTIFY (1ULL << 63) 257
262#define HV_X64_MSR_CRASH_PARAMS \ 258/* TSC emulation after migration */
263 (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0)) 259#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106
260#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107
261#define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108
264 262
265/* 263/*
266 * Declare the MSR used to setup pages used to communicate with the hypervisor. 264 * Declare the MSR used to setup pages used to communicate with the hypervisor.
@@ -271,7 +269,7 @@ union hv_x64_msr_hypercall_contents {
271 u64 enable:1; 269 u64 enable:1;
272 u64 reserved:11; 270 u64 reserved:11;
273 u64 guest_physical_address:52; 271 u64 guest_physical_address:52;
274 }; 272 } __packed;
275}; 273};
276 274
277/* 275/*
@@ -283,7 +281,7 @@ struct ms_hyperv_tsc_page {
283 volatile u64 tsc_scale; 281 volatile u64 tsc_scale;
284 volatile s64 tsc_offset; 282 volatile s64 tsc_offset;
285 u64 reserved2[509]; 283 u64 reserved2[509];
286}; 284} __packed;
287 285
288/* 286/*
289 * The guest OS needs to register the guest ID with the hypervisor. 287 * The guest OS needs to register the guest ID with the hypervisor.
@@ -311,39 +309,37 @@ struct ms_hyperv_tsc_page {
311 309
312#define HV_LINUX_VENDOR_ID 0x8100 310#define HV_LINUX_VENDOR_ID 0x8100
313 311
314/* TSC emulation after migration */
315#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106
316
317/* Nested features (CPUID 0x4000000A) EAX */
318#define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18)
319#define HV_X64_NESTED_MSR_BITMAP BIT(19)
320
321struct hv_reenlightenment_control { 312struct hv_reenlightenment_control {
322 __u64 vector:8; 313 __u64 vector:8;
323 __u64 reserved1:8; 314 __u64 reserved1:8;
324 __u64 enabled:1; 315 __u64 enabled:1;
325 __u64 reserved2:15; 316 __u64 reserved2:15;
326 __u64 target_vp:32; 317 __u64 target_vp:32;
327}; 318} __packed;
328
329#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107
330#define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108
331 319
332struct hv_tsc_emulation_control { 320struct hv_tsc_emulation_control {
333 __u64 enabled:1; 321 __u64 enabled:1;
334 __u64 reserved:63; 322 __u64 reserved:63;
335}; 323} __packed;
336 324
337struct hv_tsc_emulation_status { 325struct hv_tsc_emulation_status {
338 __u64 inprogress:1; 326 __u64 inprogress:1;
339 __u64 reserved:63; 327 __u64 reserved:63;
340}; 328} __packed;
341 329
342#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001 330#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001
343#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12 331#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12
344#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ 332#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \
345 (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) 333 (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))
346 334
335/*
336 * Crash notification (HV_X64_MSR_CRASH_CTL) flags.
337 */
338#define HV_CRASH_CTL_CRASH_NOTIFY_MSG BIT_ULL(62)
339#define HV_CRASH_CTL_CRASH_NOTIFY BIT_ULL(63)
340#define HV_X64_MSR_CRASH_PARAMS \
341 (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0))
342
347#define HV_IPI_LOW_VECTOR 0x10 343#define HV_IPI_LOW_VECTOR 0x10
348#define HV_IPI_HIGH_VECTOR 0xff 344#define HV_IPI_HIGH_VECTOR 0xff
349 345
@@ -358,6 +354,7 @@ struct hv_tsc_emulation_status {
358#define HVCALL_POST_MESSAGE 0x005c 354#define HVCALL_POST_MESSAGE 0x005c
359#define HVCALL_SIGNAL_EVENT 0x005d 355#define HVCALL_SIGNAL_EVENT 0x005d
360#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af 356#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af
357#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0
361 358
362#define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 359#define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001
363#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 360#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12
@@ -409,7 +406,7 @@ typedef struct _HV_REFERENCE_TSC_PAGE {
409 __u32 res1; 406 __u32 res1;
410 __u64 tsc_scale; 407 __u64 tsc_scale;
411 __s64 tsc_offset; 408 __s64 tsc_offset;
412} HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE; 409} __packed HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE;
413 410
414/* Define the number of synthetic interrupt sources. */ 411/* Define the number of synthetic interrupt sources. */
415#define HV_SYNIC_SINT_COUNT (16) 412#define HV_SYNIC_SINT_COUNT (16)
@@ -466,7 +463,7 @@ union hv_message_flags {
466 struct { 463 struct {
467 __u8 msg_pending:1; 464 __u8 msg_pending:1;
468 __u8 reserved:7; 465 __u8 reserved:7;
469 }; 466 } __packed;
470}; 467};
471 468
472/* Define port identifier type. */ 469/* Define port identifier type. */
@@ -475,7 +472,7 @@ union hv_port_id {
475 struct { 472 struct {
476 __u32 id:24; 473 __u32 id:24;
477 __u32 reserved:8; 474 __u32 reserved:8;
478 } u; 475 } __packed u;
479}; 476};
480 477
481/* Define synthetic interrupt controller message header. */ 478/* Define synthetic interrupt controller message header. */
@@ -488,7 +485,7 @@ struct hv_message_header {
488 __u64 sender; 485 __u64 sender;
489 union hv_port_id port; 486 union hv_port_id port;
490 }; 487 };
491}; 488} __packed;
492 489
493/* Define synthetic interrupt controller message format. */ 490/* Define synthetic interrupt controller message format. */
494struct hv_message { 491struct hv_message {
@@ -496,12 +493,12 @@ struct hv_message {
496 union { 493 union {
497 __u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT]; 494 __u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT];
498 } u; 495 } u;
499}; 496} __packed;
500 497
501/* Define the synthetic interrupt message page layout. */ 498/* Define the synthetic interrupt message page layout. */
502struct hv_message_page { 499struct hv_message_page {
503 struct hv_message sint_message[HV_SYNIC_SINT_COUNT]; 500 struct hv_message sint_message[HV_SYNIC_SINT_COUNT];
504}; 501} __packed;
505 502
506/* Define timer message payload structure. */ 503/* Define timer message payload structure. */
507struct hv_timer_message_payload { 504struct hv_timer_message_payload {
@@ -509,7 +506,7 @@ struct hv_timer_message_payload {
509 __u32 reserved; 506 __u32 reserved;
510 __u64 expiration_time; /* When the timer expired */ 507 __u64 expiration_time; /* When the timer expired */
511 __u64 delivery_time; /* When the message was delivered */ 508 __u64 delivery_time; /* When the message was delivered */
512}; 509} __packed;
513 510
514/* Define virtual processor assist page structure. */ 511/* Define virtual processor assist page structure. */
515struct hv_vp_assist_page { 512struct hv_vp_assist_page {
@@ -518,8 +515,9 @@ struct hv_vp_assist_page {
518 __u64 vtl_control[2]; 515 __u64 vtl_control[2];
519 __u64 nested_enlightenments_control[2]; 516 __u64 nested_enlightenments_control[2];
520 __u32 enlighten_vmentry; 517 __u32 enlighten_vmentry;
518 __u32 padding;
521 __u64 current_nested_vmcs; 519 __u64 current_nested_vmcs;
522}; 520} __packed;
523 521
524struct hv_enlightened_vmcs { 522struct hv_enlightened_vmcs {
525 u32 revision_id; 523 u32 revision_id;
@@ -533,6 +531,8 @@ struct hv_enlightened_vmcs {
533 u16 host_gs_selector; 531 u16 host_gs_selector;
534 u16 host_tr_selector; 532 u16 host_tr_selector;
535 533
534 u16 padding16_1;
535
536 u64 host_ia32_pat; 536 u64 host_ia32_pat;
537 u64 host_ia32_efer; 537 u64 host_ia32_efer;
538 538
@@ -651,7 +651,7 @@ struct hv_enlightened_vmcs {
651 u64 ept_pointer; 651 u64 ept_pointer;
652 652
653 u16 virtual_processor_id; 653 u16 virtual_processor_id;
654 u16 padding16[3]; 654 u16 padding16_2[3];
655 655
656 u64 padding64_2[5]; 656 u64 padding64_2[5];
657 u64 guest_physical_address; 657 u64 guest_physical_address;
@@ -693,7 +693,7 @@ struct hv_enlightened_vmcs {
693 u32 nested_flush_hypercall:1; 693 u32 nested_flush_hypercall:1;
694 u32 msr_bitmap:1; 694 u32 msr_bitmap:1;
695 u32 reserved:30; 695 u32 reserved:30;
696 } hv_enlightenments_control; 696 } __packed hv_enlightenments_control;
697 u32 hv_vp_id; 697 u32 hv_vp_id;
698 698
699 u64 hv_vm_id; 699 u64 hv_vm_id;
@@ -703,7 +703,7 @@ struct hv_enlightened_vmcs {
703 u64 padding64_5[7]; 703 u64 padding64_5[7];
704 u64 xss_exit_bitmap; 704 u64 xss_exit_bitmap;
705 u64 padding64_6[7]; 705 u64 padding64_6[7];
706}; 706} __packed;
707 707
708#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE 0 708#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE 0
709#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP BIT(0) 709#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP BIT(0)
@@ -725,36 +725,129 @@ struct hv_enlightened_vmcs {
725 725
726#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF 726#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF
727 727
728#define HV_STIMER_ENABLE (1ULL << 0) 728/* Define synthetic interrupt controller flag constants. */
729#define HV_STIMER_PERIODIC (1ULL << 1) 729#define HV_EVENT_FLAGS_COUNT (256 * 8)
730#define HV_STIMER_LAZY (1ULL << 2) 730#define HV_EVENT_FLAGS_LONG_COUNT (256 / sizeof(unsigned long))
731#define HV_STIMER_AUTOENABLE (1ULL << 3) 731
732#define HV_STIMER_SINT(config) (__u8)(((config) >> 16) & 0x0F) 732/*
733 * Synthetic timer configuration.
734 */
735union hv_stimer_config {
736 u64 as_uint64;
737 struct {
738 u64 enable:1;
739 u64 periodic:1;
740 u64 lazy:1;
741 u64 auto_enable:1;
742 u64 apic_vector:8;
743 u64 direct_mode:1;
744 u64 reserved_z0:3;
745 u64 sintx:4;
746 u64 reserved_z1:44;
747 } __packed;
748};
749
750
751/* Define the synthetic interrupt controller event flags format. */
752union hv_synic_event_flags {
753 unsigned long flags[HV_EVENT_FLAGS_LONG_COUNT];
754};
755
756/* Define SynIC control register. */
757union hv_synic_scontrol {
758 u64 as_uint64;
759 struct {
760 u64 enable:1;
761 u64 reserved:63;
762 } __packed;
763};
764
765/* Define synthetic interrupt source. */
766union hv_synic_sint {
767 u64 as_uint64;
768 struct {
769 u64 vector:8;
770 u64 reserved1:8;
771 u64 masked:1;
772 u64 auto_eoi:1;
773 u64 reserved2:46;
774 } __packed;
775};
776
777/* Define the format of the SIMP register */
778union hv_synic_simp {
779 u64 as_uint64;
780 struct {
781 u64 simp_enabled:1;
782 u64 preserved:11;
783 u64 base_simp_gpa:52;
784 } __packed;
785};
786
787/* Define the format of the SIEFP register */
788union hv_synic_siefp {
789 u64 as_uint64;
790 struct {
791 u64 siefp_enabled:1;
792 u64 preserved:11;
793 u64 base_siefp_gpa:52;
794 } __packed;
795};
733 796
734struct hv_vpset { 797struct hv_vpset {
735 u64 format; 798 u64 format;
736 u64 valid_bank_mask; 799 u64 valid_bank_mask;
737 u64 bank_contents[]; 800 u64 bank_contents[];
738}; 801} __packed;
739 802
740/* HvCallSendSyntheticClusterIpi hypercall */ 803/* HvCallSendSyntheticClusterIpi hypercall */
741struct hv_send_ipi { 804struct hv_send_ipi {
742 u32 vector; 805 u32 vector;
743 u32 reserved; 806 u32 reserved;
744 u64 cpu_mask; 807 u64 cpu_mask;
745}; 808} __packed;
746 809
747/* HvCallSendSyntheticClusterIpiEx hypercall */ 810/* HvCallSendSyntheticClusterIpiEx hypercall */
748struct hv_send_ipi_ex { 811struct hv_send_ipi_ex {
749 u32 vector; 812 u32 vector;
750 u32 reserved; 813 u32 reserved;
751 struct hv_vpset vp_set; 814 struct hv_vpset vp_set;
752}; 815} __packed;
753 816
754/* HvFlushGuestPhysicalAddressSpace hypercalls */ 817/* HvFlushGuestPhysicalAddressSpace hypercalls */
755struct hv_guest_mapping_flush { 818struct hv_guest_mapping_flush {
756 u64 address_space; 819 u64 address_space;
757 u64 flags; 820 u64 flags;
821} __packed;
822
823/*
824 * HV_MAX_FLUSH_PAGES = "additional_pages" + 1. It's limited
825 * by the bitwidth of "additional_pages" in union hv_gpa_page_range.
826 */
827#define HV_MAX_FLUSH_PAGES (2048)
828
829/* HvFlushGuestPhysicalAddressList hypercall */
830union hv_gpa_page_range {
831 u64 address_space;
832 struct {
833 u64 additional_pages:11;
834 u64 largepage:1;
835 u64 basepfn:52;
836 } page;
837};
838
839/*
 840 * All input flush parameters must fit in a single page. The max flush
 841 * count is the number of union hv_gpa_page_range entries that can be
 842 * populated into the input parameter page.
843 */
 844#define HV_MAX_FLUSH_REP_COUNT ((PAGE_SIZE - 2 * sizeof(u64)) /	\
 845		sizeof(union hv_gpa_page_range))
846
847struct hv_guest_mapping_flush_list {
848 u64 address_space;
849 u64 flags;
850 union hv_gpa_page_range gpa_list[HV_MAX_FLUSH_REP_COUNT];
758}; 851};
759 852
760/* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ 853/* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */
@@ -763,7 +856,7 @@ struct hv_tlb_flush {
763 u64 flags; 856 u64 flags;
764 u64 processor_mask; 857 u64 processor_mask;
765 u64 gva_list[]; 858 u64 gva_list[];
766}; 859} __packed;
767 860
768/* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */ 861/* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */
769struct hv_tlb_flush_ex { 862struct hv_tlb_flush_ex {
@@ -771,6 +864,6 @@ struct hv_tlb_flush_ex {
771 u64 flags; 864 u64 flags;
772 struct hv_vpset hv_vp_set; 865 struct hv_vpset hv_vp_set;
773 u64 gva_list[]; 866 u64 gva_list[];
774}; 867} __packed;
775 868
776#endif 869#endif
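As a quick sanity check on the flush-list sizing above: with a 4096-byte page, the two u64 header fields (address_space and flags) occupy 16 bytes and each union hv_gpa_page_range entry occupies 8, so HV_MAX_FLUSH_REP_COUNT works out to (4096 - 16) / 8 = 510 and struct hv_guest_mapping_flush_list fills exactly one page. A hypothetical compile-time check (not in this patch) that could be placed in any function using the structure:

	/* The rep list plus its header must fit in the single input page. */
	BUILD_BUG_ON(sizeof(struct hv_guest_mapping_flush_list) > PAGE_SIZE);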
diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
index b523f51c5400..634f99b1dc22 100644
--- a/arch/x86/include/asm/intel_pt.h
+++ b/arch/x86/include/asm/intel_pt.h
@@ -2,10 +2,36 @@
2#ifndef _ASM_X86_INTEL_PT_H 2#ifndef _ASM_X86_INTEL_PT_H
3#define _ASM_X86_INTEL_PT_H 3#define _ASM_X86_INTEL_PT_H
4 4
5#define PT_CPUID_LEAVES 2
 6#define PT_CPUID_REGS_NUM	4 /* number of registers (eax, ebx, ecx, edx) */
7
8enum pt_capabilities {
9 PT_CAP_max_subleaf = 0,
10 PT_CAP_cr3_filtering,
11 PT_CAP_psb_cyc,
12 PT_CAP_ip_filtering,
13 PT_CAP_mtc,
14 PT_CAP_ptwrite,
15 PT_CAP_power_event_trace,
16 PT_CAP_topa_output,
17 PT_CAP_topa_multiple_entries,
18 PT_CAP_single_range_output,
19 PT_CAP_output_subsys,
20 PT_CAP_payloads_lip,
21 PT_CAP_num_address_ranges,
22 PT_CAP_mtc_periods,
23 PT_CAP_cycle_thresholds,
24 PT_CAP_psb_periods,
25};
26
5#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) 27#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
6void cpu_emergency_stop_pt(void); 28void cpu_emergency_stop_pt(void);
29extern u32 intel_pt_validate_hw_cap(enum pt_capabilities cap);
30extern u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities cap);
7#else 31#else
8static inline void cpu_emergency_stop_pt(void) {} 32static inline void cpu_emergency_stop_pt(void) {}
33static inline u32 intel_pt_validate_hw_cap(enum pt_capabilities cap) { return 0; }
34static inline u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability) { return 0; }
9#endif 35#endif
10 36
11#endif /* _ASM_X86_INTEL_PT_H */ 37#endif /* _ASM_X86_INTEL_PT_H */
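The reason intel_pt_validate_cap() is split out from intel_pt_validate_hw_cap() is that a caller can now validate a capability against a caps[] array other than the host's pt_pmu.caps, for example one built from a guest's CPUID. A minimal sketch, assuming a hypothetical guest_caps[] array laid out the same way (CPUID leaf 0x14 sub-leaves 0 and 1, EAX/EBX/ECX/EDX order):

#include <asm/intel_pt.h>

static bool pt_addr_ranges_ok(u32 *guest_caps, u32 nr_ranges)
{
	/* The host must implement at least this many address ranges... */
	if (nr_ranges > intel_pt_validate_hw_cap(PT_CAP_num_address_ranges))
		return false;

	/* ...and the guest-visible capabilities must advertise them too. */
	return nr_ranges <= intel_pt_validate_cap(guest_caps,
						  PT_CAP_num_address_ranges);
}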
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fbda5a917c5b..4660ce90de7f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -439,6 +439,11 @@ struct kvm_mmu {
439 u64 pdptrs[4]; /* pae */ 439 u64 pdptrs[4]; /* pae */
440}; 440};
441 441
442struct kvm_tlb_range {
443 u64 start_gfn;
444 u64 pages;
445};
446
442enum pmc_type { 447enum pmc_type {
443 KVM_PMC_GP = 0, 448 KVM_PMC_GP = 0,
444 KVM_PMC_FIXED, 449 KVM_PMC_FIXED,
@@ -497,7 +502,7 @@ struct kvm_mtrr {
497struct kvm_vcpu_hv_stimer { 502struct kvm_vcpu_hv_stimer {
498 struct hrtimer timer; 503 struct hrtimer timer;
499 int index; 504 int index;
500 u64 config; 505 union hv_stimer_config config;
501 u64 count; 506 u64 count;
502 u64 exp_time; 507 u64 exp_time;
503 struct hv_message msg; 508 struct hv_message msg;
@@ -601,17 +606,16 @@ struct kvm_vcpu_arch {
601 606
602 /* 607 /*
603 * QEMU userspace and the guest each have their own FPU state. 608 * QEMU userspace and the guest each have their own FPU state.
604 * In vcpu_run, we switch between the user and guest FPU contexts. 609 * In vcpu_run, we switch between the user, maintained in the
605 * While running a VCPU, the VCPU thread will have the guest FPU 610 * task_struct struct, and guest FPU contexts. While running a VCPU,
606 * context. 611 * the VCPU thread will have the guest FPU context.
607 * 612 *
608 * Note that while the PKRU state lives inside the fpu registers, 613 * Note that while the PKRU state lives inside the fpu registers,
609 * it is switched out separately at VMENTER and VMEXIT time. The 614 * it is switched out separately at VMENTER and VMEXIT time. The
610 * "guest_fpu" state here contains the guest FPU context, with the 615 * "guest_fpu" state here contains the guest FPU context, with the
 611	 * host PKRU bits.	616	 * host PKRU bits.
612 */ 617 */
613 struct fpu user_fpu; 618 struct fpu *guest_fpu;
614 struct fpu guest_fpu;
615 619
616 u64 xcr0; 620 u64 xcr0;
617 u64 guest_supported_xcr0; 621 u64 guest_supported_xcr0;
@@ -1042,6 +1046,8 @@ struct kvm_x86_ops {
1042 1046
1043 void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa); 1047 void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa);
1044 int (*tlb_remote_flush)(struct kvm *kvm); 1048 int (*tlb_remote_flush)(struct kvm *kvm);
1049 int (*tlb_remote_flush_with_range)(struct kvm *kvm,
1050 struct kvm_tlb_range *range);
1045 1051
1046 /* 1052 /*
1047 * Flush any TLB entries associated with the given GVA. 1053 * Flush any TLB entries associated with the given GVA.
@@ -1106,6 +1112,7 @@ struct kvm_x86_ops {
1106 bool (*mpx_supported)(void); 1112 bool (*mpx_supported)(void);
1107 bool (*xsaves_supported)(void); 1113 bool (*xsaves_supported)(void);
1108 bool (*umip_emulated)(void); 1114 bool (*umip_emulated)(void);
1115 bool (*pt_supported)(void);
1109 1116
1110 int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); 1117 int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
1111 void (*request_immediate_exit)(struct kvm_vcpu *vcpu); 1118 void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
@@ -1186,6 +1193,7 @@ struct kvm_x86_ops {
1186 1193
1187 int (*nested_enable_evmcs)(struct kvm_vcpu *vcpu, 1194 int (*nested_enable_evmcs)(struct kvm_vcpu *vcpu,
1188 uint16_t *vmcs_version); 1195 uint16_t *vmcs_version);
1196 uint16_t (*nested_get_evmcs_version)(struct kvm_vcpu *vcpu);
1189}; 1197};
1190 1198
1191struct kvm_arch_async_pf { 1199struct kvm_arch_async_pf {
@@ -1196,6 +1204,7 @@ struct kvm_arch_async_pf {
1196}; 1204};
1197 1205
1198extern struct kvm_x86_ops *kvm_x86_ops; 1206extern struct kvm_x86_ops *kvm_x86_ops;
1207extern struct kmem_cache *x86_fpu_cache;
1199 1208
1200#define __KVM_HAVE_ARCH_VM_ALLOC 1209#define __KVM_HAVE_ARCH_VM_ALLOC
1201static inline struct kvm *kvm_arch_alloc_vm(void) 1210static inline struct kvm *kvm_arch_alloc_vm(void)
@@ -1492,7 +1501,7 @@ asmlinkage void kvm_spurious_fault(void);
1492 "cmpb $0, kvm_rebooting \n\t" \ 1501 "cmpb $0, kvm_rebooting \n\t" \
1493 "jne 668b \n\t" \ 1502 "jne 668b \n\t" \
1494 __ASM_SIZE(push) " $666b \n\t" \ 1503 __ASM_SIZE(push) " $666b \n\t" \
1495 "call kvm_spurious_fault \n\t" \ 1504 "jmp kvm_spurious_fault \n\t" \
1496 ".popsection \n\t" \ 1505 ".popsection \n\t" \
1497 _ASM_EXTABLE(666b, 667b) 1506 _ASM_EXTABLE(666b, 667b)
1498 1507
@@ -1503,7 +1512,7 @@ asmlinkage void kvm_spurious_fault(void);
1503int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); 1512int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end);
1504int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); 1513int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
1505int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); 1514int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
1506void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); 1515int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
1507int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); 1516int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
1508int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); 1517int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
1509int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); 1518int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
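The new tlb_remote_flush_with_range hook is optional: the expectation is that callers attempt the ranged flush and fall back to a full remote flush when the hook is absent or fails, which is roughly how the MMU code later in this series uses it. A hedged sketch of that calling pattern, built on the existing kvm_flush_remote_tlbs() helper:

#include <linux/kvm_host.h>

static void flush_remote_tlbs_with_range(struct kvm *kvm,
					 u64 start_gfn, u64 pages)
{
	struct kvm_tlb_range range = {
		.start_gfn = start_gfn,
		.pages = pages,
	};
	int ret = -EOPNOTSUPP;

	if (kvm_x86_ops->tlb_remote_flush_with_range)
		ret = kvm_x86_ops->tlb_remote_flush_with_range(kvm, &range);

	/* Fall back to flushing everything if the ranged flush failed. */
	if (ret)
		kvm_flush_remote_tlbs(kvm);
}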
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 1d0a7778e163..cc60e617931c 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -22,6 +22,11 @@ struct ms_hyperv_info {
22 22
23extern struct ms_hyperv_info ms_hyperv; 23extern struct ms_hyperv_info ms_hyperv;
24 24
25
26typedef int (*hyperv_fill_flush_list_func)(
27 struct hv_guest_mapping_flush_list *flush,
28 void *data);
29
25/* 30/*
26 * Generate the guest ID. 31 * Generate the guest ID.
27 */ 32 */
@@ -348,6 +353,11 @@ void set_hv_tscchange_cb(void (*cb)(void));
348void clear_hv_tscchange_cb(void); 353void clear_hv_tscchange_cb(void);
349void hyperv_stop_tsc_emulation(void); 354void hyperv_stop_tsc_emulation(void);
350int hyperv_flush_guest_mapping(u64 as); 355int hyperv_flush_guest_mapping(u64 as);
356int hyperv_flush_guest_mapping_range(u64 as,
357 hyperv_fill_flush_list_func fill_func, void *data);
358int hyperv_fill_flush_guest_mapping_list(
359 struct hv_guest_mapping_flush_list *flush,
 360 u64 start_gfn, u64 pages);
351 361
352#ifdef CONFIG_X86_64 362#ifdef CONFIG_X86_64
353void hv_apic_init(void); 363void hv_apic_init(void);
@@ -370,6 +380,11 @@ static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
370 return NULL; 380 return NULL;
371} 381}
372static inline int hyperv_flush_guest_mapping(u64 as) { return -1; } 382static inline int hyperv_flush_guest_mapping(u64 as) { return -1; }
383static inline int hyperv_flush_guest_mapping_range(u64 as,
384 hyperv_fill_flush_list_func fill_func, void *data)
385{
386 return -1;
387}
373#endif /* CONFIG_HYPERV */ 388#endif /* CONFIG_HYPERV */
374 389
375#ifdef CONFIG_HYPERV_TSCPAGE 390#ifdef CONFIG_HYPERV_TSCPAGE
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 9e39cc8bd989..8e40c2446fd1 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -121,7 +121,43 @@
121#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 121#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6
122 122
123#define MSR_IA32_RTIT_CTL 0x00000570 123#define MSR_IA32_RTIT_CTL 0x00000570
124#define RTIT_CTL_TRACEEN BIT(0)
125#define RTIT_CTL_CYCLEACC BIT(1)
126#define RTIT_CTL_OS BIT(2)
127#define RTIT_CTL_USR BIT(3)
128#define RTIT_CTL_PWR_EVT_EN BIT(4)
129#define RTIT_CTL_FUP_ON_PTW BIT(5)
130#define RTIT_CTL_FABRIC_EN BIT(6)
131#define RTIT_CTL_CR3EN BIT(7)
132#define RTIT_CTL_TOPA BIT(8)
133#define RTIT_CTL_MTC_EN BIT(9)
134#define RTIT_CTL_TSC_EN BIT(10)
135#define RTIT_CTL_DISRETC BIT(11)
136#define RTIT_CTL_PTW_EN BIT(12)
137#define RTIT_CTL_BRANCH_EN BIT(13)
138#define RTIT_CTL_MTC_RANGE_OFFSET 14
139#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET)
140#define RTIT_CTL_CYC_THRESH_OFFSET 19
141#define RTIT_CTL_CYC_THRESH (0x0full << RTIT_CTL_CYC_THRESH_OFFSET)
142#define RTIT_CTL_PSB_FREQ_OFFSET 24
143#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET)
144#define RTIT_CTL_ADDR0_OFFSET 32
145#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET)
146#define RTIT_CTL_ADDR1_OFFSET 36
147#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET)
148#define RTIT_CTL_ADDR2_OFFSET 40
149#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET)
150#define RTIT_CTL_ADDR3_OFFSET 44
151#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET)
124#define MSR_IA32_RTIT_STATUS 0x00000571 152#define MSR_IA32_RTIT_STATUS 0x00000571
153#define RTIT_STATUS_FILTEREN BIT(0)
154#define RTIT_STATUS_CONTEXTEN BIT(1)
155#define RTIT_STATUS_TRIGGEREN BIT(2)
156#define RTIT_STATUS_BUFFOVF BIT(3)
157#define RTIT_STATUS_ERROR BIT(4)
158#define RTIT_STATUS_STOPPED BIT(5)
159#define RTIT_STATUS_BYTECNT_OFFSET 32
160#define RTIT_STATUS_BYTECNT (0x1ffffull << RTIT_STATUS_BYTECNT_OFFSET)
125#define MSR_IA32_RTIT_ADDR0_A 0x00000580 161#define MSR_IA32_RTIT_ADDR0_A 0x00000580
126#define MSR_IA32_RTIT_ADDR0_B 0x00000581 162#define MSR_IA32_RTIT_ADDR0_B 0x00000581
127#define MSR_IA32_RTIT_ADDR1_A 0x00000582 163#define MSR_IA32_RTIT_ADDR1_A 0x00000582
@@ -772,6 +808,7 @@
772#define VMX_BASIC_INOUT 0x0040000000000000LLU 808#define VMX_BASIC_INOUT 0x0040000000000000LLU
773 809
774/* MSR_IA32_VMX_MISC bits */ 810/* MSR_IA32_VMX_MISC bits */
811#define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14)
775#define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) 812#define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
776#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F 813#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F
777/* AMD-V MSRs */ 814/* AMD-V MSRs */
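The RTIT_CTL_*_OFFSET/mask pairs above describe small bitfields packed into MSR_IA32_RTIT_CTL; the MTC frequency, for instance, is a 4-bit field at bits 17:14. A hypothetical helper pair (not part of this patch) showing how such a field is packed and read back:

#include <asm/msr-index.h>

/* Place a 4-bit MTC period code into an RTIT_CTL value and enable MTC. */
static inline u64 rtit_ctl_set_mtc_period(u64 ctl, u64 period)
{
	ctl &= ~RTIT_CTL_MTC_RANGE;
	ctl |= (period << RTIT_CTL_MTC_RANGE_OFFSET) & RTIT_CTL_MTC_RANGE;
	return ctl | RTIT_CTL_MTC_EN;
}

/* Extract the MTC period code from an RTIT_CTL value. */
static inline u64 rtit_ctl_get_mtc_period(u64 ctl)
{
	return (ctl & RTIT_CTL_MTC_RANGE) >> RTIT_CTL_MTC_RANGE_OFFSET;
}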
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 93b462e48067..dec9c1e84c78 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -290,11 +290,4 @@ struct __attribute__ ((__packed__)) vmcb {
290 290
291#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP) 291#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP)
292 292
293#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
294#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8"
295#define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb"
296#define SVM_CLGI ".byte 0x0f, 0x01, 0xdd"
297#define SVM_STGI ".byte 0x0f, 0x01, 0xdc"
298#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf"
299
300#endif 293#endif
diff --git a/arch/x86/include/asm/trace/hyperv.h b/arch/x86/include/asm/trace/hyperv.h
index 2e6245a023ef..ace464f09681 100644
--- a/arch/x86/include/asm/trace/hyperv.h
+++ b/arch/x86/include/asm/trace/hyperv.h
@@ -42,6 +42,20 @@ TRACE_EVENT(hyperv_nested_flush_guest_mapping,
42 TP_printk("address space %llx ret %d", __entry->as, __entry->ret) 42 TP_printk("address space %llx ret %d", __entry->as, __entry->ret)
43 ); 43 );
44 44
45TRACE_EVENT(hyperv_nested_flush_guest_mapping_range,
46 TP_PROTO(u64 as, int ret),
47 TP_ARGS(as, ret),
48
49 TP_STRUCT__entry(
50 __field(u64, as)
51 __field(int, ret)
52 ),
53 TP_fast_assign(__entry->as = as;
54 __entry->ret = ret;
55 ),
56 TP_printk("address space %llx ret %d", __entry->as, __entry->ret)
57 );
58
45TRACE_EVENT(hyperv_send_ipi_mask, 59TRACE_EVENT(hyperv_send_ipi_mask,
46 TP_PROTO(const struct cpumask *cpus, 60 TP_PROTO(const struct cpumask *cpus,
47 int vector), 61 int vector),
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index ade0f153947d..4e4133e86484 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -77,7 +77,10 @@
77#define SECONDARY_EXEC_ENCLS_EXITING 0x00008000 77#define SECONDARY_EXEC_ENCLS_EXITING 0x00008000
78#define SECONDARY_EXEC_RDSEED_EXITING 0x00010000 78#define SECONDARY_EXEC_RDSEED_EXITING 0x00010000
79#define SECONDARY_EXEC_ENABLE_PML 0x00020000 79#define SECONDARY_EXEC_ENABLE_PML 0x00020000
80#define SECONDARY_EXEC_PT_CONCEAL_VMX 0x00080000
80#define SECONDARY_EXEC_XSAVES 0x00100000 81#define SECONDARY_EXEC_XSAVES 0x00100000
82#define SECONDARY_EXEC_PT_USE_GPA 0x01000000
83#define SECONDARY_EXEC_MODE_BASED_EPT_EXEC 0x00400000
81#define SECONDARY_EXEC_TSC_SCALING 0x02000000 84#define SECONDARY_EXEC_TSC_SCALING 0x02000000
82 85
83#define PIN_BASED_EXT_INTR_MASK 0x00000001 86#define PIN_BASED_EXT_INTR_MASK 0x00000001
@@ -98,6 +101,8 @@
98#define VM_EXIT_LOAD_IA32_EFER 0x00200000 101#define VM_EXIT_LOAD_IA32_EFER 0x00200000
99#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000 102#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000
100#define VM_EXIT_CLEAR_BNDCFGS 0x00800000 103#define VM_EXIT_CLEAR_BNDCFGS 0x00800000
104#define VM_EXIT_PT_CONCEAL_PIP 0x01000000
105#define VM_EXIT_CLEAR_IA32_RTIT_CTL 0x02000000
101 106
102#define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff 107#define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff
103 108
@@ -109,6 +114,8 @@
109#define VM_ENTRY_LOAD_IA32_PAT 0x00004000 114#define VM_ENTRY_LOAD_IA32_PAT 0x00004000
110#define VM_ENTRY_LOAD_IA32_EFER 0x00008000 115#define VM_ENTRY_LOAD_IA32_EFER 0x00008000
111#define VM_ENTRY_LOAD_BNDCFGS 0x00010000 116#define VM_ENTRY_LOAD_BNDCFGS 0x00010000
117#define VM_ENTRY_PT_CONCEAL_PIP 0x00020000
118#define VM_ENTRY_LOAD_IA32_RTIT_CTL 0x00040000
112 119
113#define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff 120#define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff
114 121
@@ -240,6 +247,8 @@ enum vmcs_field {
240 GUEST_PDPTR3_HIGH = 0x00002811, 247 GUEST_PDPTR3_HIGH = 0x00002811,
241 GUEST_BNDCFGS = 0x00002812, 248 GUEST_BNDCFGS = 0x00002812,
242 GUEST_BNDCFGS_HIGH = 0x00002813, 249 GUEST_BNDCFGS_HIGH = 0x00002813,
250 GUEST_IA32_RTIT_CTL = 0x00002814,
251 GUEST_IA32_RTIT_CTL_HIGH = 0x00002815,
243 HOST_IA32_PAT = 0x00002c00, 252 HOST_IA32_PAT = 0x00002c00,
244 HOST_IA32_PAT_HIGH = 0x00002c01, 253 HOST_IA32_PAT_HIGH = 0x00002c01,
245 HOST_IA32_EFER = 0x00002c02, 254 HOST_IA32_EFER = 0x00002c02,
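The Intel PT additions above come in matched pairs: a pair of secondary execution controls (PT_CONCEAL_VMX, PT_USE_GPA), VM-exit controls (PT_CONCEAL_PIP, CLEAR_IA32_RTIT_CTL), VM-entry controls (PT_CONCEAL_PIP, LOAD_IA32_RTIT_CTL) and a guest-state field for IA32_RTIT_CTL. A rough sketch — not the series' actual code — of how the entry/exit bits pair up when PT is virtualized per guest; pt_guest_mode is an assumed flag, and the real policy lives in the new vmx/vmx.c:

	static u32 pt_vmentry_ctrl_bits(bool pt_guest_mode)
	{
		/* hide VMX transitions from PT and load the guest's RTIT_CTL */
		return pt_guest_mode ?
			(VM_ENTRY_PT_CONCEAL_PIP | VM_ENTRY_LOAD_IA32_RTIT_CTL) : 0;
	}

	static u32 pt_vmexit_ctrl_bits(bool pt_guest_mode)
	{
		/* hide VMX transitions from PT and clear RTIT_CTL on exit */
		return pt_guest_mode ?
			(VM_EXIT_PT_CONCEAL_PIP | VM_EXIT_CLEAR_IA32_RTIT_CTL) : 0;
	}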
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 30084ecaa20f..e811d4d1c824 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -1,19 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* KVM paravirtual clock driver. A clocksource implementation 2/* KVM paravirtual clock driver. A clocksource implementation
2 Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc. 3 Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17*/ 4*/
18 5
19#include <linux/clocksource.h> 6#include <linux/clocksource.h>
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index dc4f2fdf5e57..69b3a7c30013 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -16,7 +16,7 @@ kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
16 i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ 16 i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
17 hyperv.o page_track.o debugfs.o 17 hyperv.o page_track.o debugfs.o
18 18
19kvm-intel-y += vmx.o pmu_intel.o 19kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o
20kvm-amd-y += svm.o pmu_amd.o 20kvm-amd-y += svm.o pmu_amd.o
21 21
22obj-$(CONFIG_KVM) += kvm.o 22obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 7bcfa61375c0..bbffa6c54697 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -67,9 +67,6 @@ u64 kvm_supported_xcr0(void)
67 67
68#define F(x) bit(X86_FEATURE_##x) 68#define F(x) bit(X86_FEATURE_##x)
69 69
70/* For scattered features from cpufeatures.h; we currently expose none */
71#define KF(x) bit(KVM_CPUID_BIT_##x)
72
73int kvm_update_cpuid(struct kvm_vcpu *vcpu) 70int kvm_update_cpuid(struct kvm_vcpu *vcpu)
74{ 71{
75 struct kvm_cpuid_entry2 *best; 72 struct kvm_cpuid_entry2 *best;
@@ -337,6 +334,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
337 unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0; 334 unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0;
338 unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; 335 unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
339 unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; 336 unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0;
337 unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0;
340 338
341 /* cpuid 1.edx */ 339 /* cpuid 1.edx */
342 const u32 kvm_cpuid_1_edx_x86_features = 340 const u32 kvm_cpuid_1_edx_x86_features =
@@ -380,8 +378,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
380 378
381 /* cpuid 0x80000008.ebx */ 379 /* cpuid 0x80000008.ebx */
382 const u32 kvm_cpuid_8000_0008_ebx_x86_features = 380 const u32 kvm_cpuid_8000_0008_ebx_x86_features =
383 F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | 381 F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
384 F(AMD_SSB_NO); 382 F(AMD_SSB_NO) | F(AMD_STIBP);
385 383
386 /* cpuid 0xC0000001.edx */ 384 /* cpuid 0xC0000001.edx */
387 const u32 kvm_cpuid_C000_0001_edx_x86_features = 385 const u32 kvm_cpuid_C000_0001_edx_x86_features =
@@ -395,7 +393,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
395 F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | 393 F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
396 F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | 394 F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
397 F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | 395 F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
398 F(SHA_NI) | F(AVX512BW) | F(AVX512VL); 396 F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt;
399 397
400 /* cpuid 0xD.1.eax */ 398 /* cpuid 0xD.1.eax */
401 const u32 kvm_cpuid_D_1_eax_x86_features = 399 const u32 kvm_cpuid_D_1_eax_x86_features =
@@ -411,7 +409,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
411 /* cpuid 7.0.edx*/ 409 /* cpuid 7.0.edx*/
412 const u32 kvm_cpuid_7_0_edx_x86_features = 410 const u32 kvm_cpuid_7_0_edx_x86_features =
413 F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | 411 F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
414 F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES); 412 F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP);
415 413
416 /* all calls to cpuid_count() should be made on the same cpu */ 414 /* all calls to cpuid_count() should be made on the same cpu */
417 get_cpu(); 415 get_cpu();
@@ -426,7 +424,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
426 424
427 switch (function) { 425 switch (function) {
428 case 0: 426 case 0:
429 entry->eax = min(entry->eax, (u32)0xd); 427 entry->eax = min(entry->eax, (u32)(f_intel_pt ? 0x14 : 0xd));
430 break; 428 break;
431 case 1: 429 case 1:
432 entry->edx &= kvm_cpuid_1_edx_x86_features; 430 entry->edx &= kvm_cpuid_1_edx_x86_features;
@@ -603,6 +601,23 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
603 } 601 }
604 break; 602 break;
605 } 603 }
604 /* Intel PT */
605 case 0x14: {
606 int t, times = entry->eax;
607
608 if (!f_intel_pt)
609 break;
610
611 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
612 for (t = 1; t <= times; ++t) {
613 if (*nent >= maxnent)
614 goto out;
615 do_cpuid_1_ent(&entry[t], function, t);
616 entry[t].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
617 ++*nent;
618 }
619 break;
620 }
606 case KVM_CPUID_SIGNATURE: { 621 case KVM_CPUID_SIGNATURE: {
607 static const char signature[12] = "KVMKVMKVM\0\0"; 622 static const char signature[12] = "KVMKVMKVM\0\0";
608 const u32 *sigptr = (const u32 *)signature; 623 const u32 *sigptr = (const u32 *)signature;
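The new 0x14 case follows the usual multi-sub-leaf pattern: sub-leaf 0's EAX reports the highest valid sub-leaf index, and every entry is flagged index-significant. A small self-contained userspace sketch of the same enumeration, using GCC's <cpuid.h> and printing raw register values:

	#include <cpuid.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx, sub, max_sub;

		/* returns 0 if the CPU's highest basic leaf is below 0x14 */
		if (!__get_cpuid_count(0x14, 0, &eax, &ebx, &ecx, &edx))
			return 1;
		max_sub = eax;			/* highest valid sub-leaf */
		for (sub = 0; sub <= max_sub; sub++) {
			__get_cpuid_count(0x14, sub, &eax, &ebx, &ecx, &edx);
			printf("0x14.%u: %08x %08x %08x %08x\n",
			       sub, eax, ebx, ecx, edx);
		}
		return 0;
	}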
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 4e80080f277a..c90a5352d158 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -38,6 +38,9 @@
38 38
39#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64) 39#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64)
40 40
41static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
42 bool vcpu_kick);
43
41static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint) 44static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint)
42{ 45{
43 return atomic64_read(&synic->sint[sint]); 46 return atomic64_read(&synic->sint[sint]);
@@ -158,59 +161,24 @@ static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx)
158 return (synic->active) ? synic : NULL; 161 return (synic->active) ? synic : NULL;
159} 162}
160 163
161static void synic_clear_sint_msg_pending(struct kvm_vcpu_hv_synic *synic,
162 u32 sint)
163{
164 struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
165 struct page *page;
166 gpa_t gpa;
167 struct hv_message *msg;
168 struct hv_message_page *msg_page;
169
170 gpa = synic->msg_page & PAGE_MASK;
171 page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
172 if (is_error_page(page)) {
173 vcpu_err(vcpu, "Hyper-V SynIC can't get msg page, gpa 0x%llx\n",
174 gpa);
175 return;
176 }
177 msg_page = kmap_atomic(page);
178
179 msg = &msg_page->sint_message[sint];
180 msg->header.message_flags.msg_pending = 0;
181
182 kunmap_atomic(msg_page);
183 kvm_release_page_dirty(page);
184 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
185}
186
187static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint) 164static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint)
188{ 165{
189 struct kvm *kvm = vcpu->kvm; 166 struct kvm *kvm = vcpu->kvm;
190 struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); 167 struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
191 struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); 168 struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
192 struct kvm_vcpu_hv_stimer *stimer; 169 struct kvm_vcpu_hv_stimer *stimer;
193 int gsi, idx, stimers_pending; 170 int gsi, idx;
194 171
195 trace_kvm_hv_notify_acked_sint(vcpu->vcpu_id, sint); 172 trace_kvm_hv_notify_acked_sint(vcpu->vcpu_id, sint);
196 173
197 if (synic->msg_page & HV_SYNIC_SIMP_ENABLE)
198 synic_clear_sint_msg_pending(synic, sint);
199
200 /* Try to deliver pending Hyper-V SynIC timers messages */ 174 /* Try to deliver pending Hyper-V SynIC timers messages */
201 stimers_pending = 0;
202 for (idx = 0; idx < ARRAY_SIZE(hv_vcpu->stimer); idx++) { 175 for (idx = 0; idx < ARRAY_SIZE(hv_vcpu->stimer); idx++) {
203 stimer = &hv_vcpu->stimer[idx]; 176 stimer = &hv_vcpu->stimer[idx];
204 if (stimer->msg_pending && 177 if (stimer->msg_pending && stimer->config.enable &&
205 (stimer->config & HV_STIMER_ENABLE) && 178 !stimer->config.direct_mode &&
206 HV_STIMER_SINT(stimer->config) == sint) { 179 stimer->config.sintx == sint)
207 set_bit(stimer->index, 180 stimer_mark_pending(stimer, false);
208 hv_vcpu->stimer_pending_bitmap);
209 stimers_pending++;
210 }
211 } 181 }
212 if (stimers_pending)
213 kvm_make_request(KVM_REQ_HV_STIMER, vcpu);
214 182
215 idx = srcu_read_lock(&kvm->irq_srcu); 183 idx = srcu_read_lock(&kvm->irq_srcu);
216 gsi = atomic_read(&synic->sint_to_gsi[sint]); 184 gsi = atomic_read(&synic->sint_to_gsi[sint]);
@@ -497,7 +465,7 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
497 time_now = get_time_ref_counter(stimer_to_vcpu(stimer)->kvm); 465 time_now = get_time_ref_counter(stimer_to_vcpu(stimer)->kvm);
498 ktime_now = ktime_get(); 466 ktime_now = ktime_get();
499 467
500 if (stimer->config & HV_STIMER_PERIODIC) { 468 if (stimer->config.periodic) {
501 if (stimer->exp_time) { 469 if (stimer->exp_time) {
502 if (time_now >= stimer->exp_time) { 470 if (time_now >= stimer->exp_time) {
503 u64 remainder; 471 u64 remainder;
@@ -546,13 +514,18 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
546static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, 514static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
547 bool host) 515 bool host)
548{ 516{
517 union hv_stimer_config new_config = {.as_uint64 = config},
518 old_config = {.as_uint64 = stimer->config.as_uint64};
519
549 trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id, 520 trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id,
550 stimer->index, config, host); 521 stimer->index, config, host);
551 522
552 stimer_cleanup(stimer); 523 stimer_cleanup(stimer);
553 if ((stimer->config & HV_STIMER_ENABLE) && HV_STIMER_SINT(config) == 0) 524 if (old_config.enable &&
554 config &= ~HV_STIMER_ENABLE; 525 !new_config.direct_mode && new_config.sintx == 0)
555 stimer->config = config; 526 new_config.enable = 0;
527 stimer->config.as_uint64 = new_config.as_uint64;
528
556 stimer_mark_pending(stimer, false); 529 stimer_mark_pending(stimer, false);
557 return 0; 530 return 0;
558} 531}
@@ -566,16 +539,16 @@ static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
566 stimer_cleanup(stimer); 539 stimer_cleanup(stimer);
567 stimer->count = count; 540 stimer->count = count;
568 if (stimer->count == 0) 541 if (stimer->count == 0)
569 stimer->config &= ~HV_STIMER_ENABLE; 542 stimer->config.enable = 0;
570 else if (stimer->config & HV_STIMER_AUTOENABLE) 543 else if (stimer->config.auto_enable)
571 stimer->config |= HV_STIMER_ENABLE; 544 stimer->config.enable = 1;
572 stimer_mark_pending(stimer, false); 545 stimer_mark_pending(stimer, false);
573 return 0; 546 return 0;
574} 547}
575 548
576static int stimer_get_config(struct kvm_vcpu_hv_stimer *stimer, u64 *pconfig) 549static int stimer_get_config(struct kvm_vcpu_hv_stimer *stimer, u64 *pconfig)
577{ 550{
578 *pconfig = stimer->config; 551 *pconfig = stimer->config.as_uint64;
579 return 0; 552 return 0;
580} 553}
581 554
@@ -586,44 +559,60 @@ static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount)
586} 559}
587 560
588static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint, 561static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint,
589 struct hv_message *src_msg) 562 struct hv_message *src_msg, bool no_retry)
590{ 563{
591 struct kvm_vcpu *vcpu = synic_to_vcpu(synic); 564 struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
592 struct page *page; 565 int msg_off = offsetof(struct hv_message_page, sint_message[sint]);
593 gpa_t gpa; 566 gfn_t msg_page_gfn;
594 struct hv_message *dst_msg; 567 struct hv_message_header hv_hdr;
595 int r; 568 int r;
596 struct hv_message_page *msg_page;
597 569
598 if (!(synic->msg_page & HV_SYNIC_SIMP_ENABLE)) 570 if (!(synic->msg_page & HV_SYNIC_SIMP_ENABLE))
599 return -ENOENT; 571 return -ENOENT;
600 572
601 gpa = synic->msg_page & PAGE_MASK; 573 msg_page_gfn = synic->msg_page >> PAGE_SHIFT;
602 page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
603 if (is_error_page(page))
604 return -EFAULT;
605 574
606 msg_page = kmap_atomic(page); 575 /*
607 dst_msg = &msg_page->sint_message[sint]; 576 * Strictly following the spec-mandated ordering would assume setting
608 if (sync_cmpxchg(&dst_msg->header.message_type, HVMSG_NONE, 577 * .msg_pending before checking .message_type. However, this function
609 src_msg->header.message_type) != HVMSG_NONE) { 578 * is only called in vcpu context so the entire update is atomic from
610 dst_msg->header.message_flags.msg_pending = 1; 579 * guest POV and thus the exact order here doesn't matter.
611 r = -EAGAIN; 580 */
612 } else { 581 r = kvm_vcpu_read_guest_page(vcpu, msg_page_gfn, &hv_hdr.message_type,
613 memcpy(&dst_msg->u.payload, &src_msg->u.payload, 582 msg_off + offsetof(struct hv_message,
614 src_msg->header.payload_size); 583 header.message_type),
615 dst_msg->header.message_type = src_msg->header.message_type; 584 sizeof(hv_hdr.message_type));
616 dst_msg->header.payload_size = src_msg->header.payload_size; 585 if (r < 0)
617 r = synic_set_irq(synic, sint); 586 return r;
618 if (r >= 1) 587
619 r = 0; 588 if (hv_hdr.message_type != HVMSG_NONE) {
620 else if (r == 0) 589 if (no_retry)
621 r = -EFAULT; 590 return 0;
591
592 hv_hdr.message_flags.msg_pending = 1;
593 r = kvm_vcpu_write_guest_page(vcpu, msg_page_gfn,
594 &hv_hdr.message_flags,
595 msg_off +
596 offsetof(struct hv_message,
597 header.message_flags),
598 sizeof(hv_hdr.message_flags));
599 if (r < 0)
600 return r;
601 return -EAGAIN;
622 } 602 }
623 kunmap_atomic(msg_page); 603
624 kvm_release_page_dirty(page); 604 r = kvm_vcpu_write_guest_page(vcpu, msg_page_gfn, src_msg, msg_off,
625 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); 605 sizeof(src_msg->header) +
626 return r; 606 src_msg->header.payload_size);
607 if (r < 0)
608 return r;
609
610 r = synic_set_irq(synic, sint);
611 if (r < 0)
612 return r;
613 if (r == 0)
614 return -EFAULT;
615 return 0;
627} 616}
628 617
629static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer) 618static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer)
@@ -633,24 +622,45 @@ static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer)
633 struct hv_timer_message_payload *payload = 622 struct hv_timer_message_payload *payload =
634 (struct hv_timer_message_payload *)&msg->u.payload; 623 (struct hv_timer_message_payload *)&msg->u.payload;
635 624
625 /*
626 * To avoid piling up periodic ticks, don't retry message
627 * delivery for them (within "lazy" lost ticks policy).
628 */
629 bool no_retry = stimer->config.periodic;
630
636 payload->expiration_time = stimer->exp_time; 631 payload->expiration_time = stimer->exp_time;
637 payload->delivery_time = get_time_ref_counter(vcpu->kvm); 632 payload->delivery_time = get_time_ref_counter(vcpu->kvm);
638 return synic_deliver_msg(vcpu_to_synic(vcpu), 633 return synic_deliver_msg(vcpu_to_synic(vcpu),
639 HV_STIMER_SINT(stimer->config), msg); 634 stimer->config.sintx, msg,
635 no_retry);
636}
637
638static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer)
639{
640 struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
641 struct kvm_lapic_irq irq = {
642 .delivery_mode = APIC_DM_FIXED,
643 .vector = stimer->config.apic_vector
644 };
645
646 return !kvm_apic_set_irq(vcpu, &irq, NULL);
640} 647}
641 648
642static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer) 649static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer)
643{ 650{
644 int r; 651 int r, direct = stimer->config.direct_mode;
645 652
646 stimer->msg_pending = true; 653 stimer->msg_pending = true;
647 r = stimer_send_msg(stimer); 654 if (!direct)
655 r = stimer_send_msg(stimer);
656 else
657 r = stimer_notify_direct(stimer);
648 trace_kvm_hv_stimer_expiration(stimer_to_vcpu(stimer)->vcpu_id, 658 trace_kvm_hv_stimer_expiration(stimer_to_vcpu(stimer)->vcpu_id,
649 stimer->index, r); 659 stimer->index, direct, r);
650 if (!r) { 660 if (!r) {
651 stimer->msg_pending = false; 661 stimer->msg_pending = false;
652 if (!(stimer->config & HV_STIMER_PERIODIC)) 662 if (!(stimer->config.periodic))
653 stimer->config &= ~HV_STIMER_ENABLE; 663 stimer->config.enable = 0;
654 } 664 }
655} 665}
656 666
@@ -664,7 +674,7 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu)
664 for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) 674 for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
665 if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) { 675 if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) {
666 stimer = &hv_vcpu->stimer[i]; 676 stimer = &hv_vcpu->stimer[i];
667 if (stimer->config & HV_STIMER_ENABLE) { 677 if (stimer->config.enable) {
668 exp_time = stimer->exp_time; 678 exp_time = stimer->exp_time;
669 679
670 if (exp_time) { 680 if (exp_time) {
@@ -674,7 +684,7 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu)
674 stimer_expiration(stimer); 684 stimer_expiration(stimer);
675 } 685 }
676 686
677 if ((stimer->config & HV_STIMER_ENABLE) && 687 if ((stimer->config.enable) &&
678 stimer->count) { 688 stimer->count) {
679 if (!stimer->msg_pending) 689 if (!stimer->msg_pending)
680 stimer_start(stimer); 690 stimer_start(stimer);
@@ -815,9 +825,9 @@ static int kvm_hv_msr_set_crash_ctl(struct kvm_vcpu *vcpu, u64 data, bool host)
815 struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; 825 struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
816 826
817 if (host) 827 if (host)
818 hv->hv_crash_ctl = data & HV_X64_MSR_CRASH_CTL_NOTIFY; 828 hv->hv_crash_ctl = data & HV_CRASH_CTL_CRASH_NOTIFY;
819 829
820 if (!host && (data & HV_X64_MSR_CRASH_CTL_NOTIFY)) { 830 if (!host && (data & HV_CRASH_CTL_CRASH_NOTIFY)) {
821 831
822 vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n", 832 vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n",
823 hv->hv_crash_param[0], 833 hv->hv_crash_param[0],
@@ -1758,3 +1768,124 @@ int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args)
1758 return kvm_hv_eventfd_deassign(kvm, args->conn_id); 1768 return kvm_hv_eventfd_deassign(kvm, args->conn_id);
1759 return kvm_hv_eventfd_assign(kvm, args->conn_id, args->fd); 1769 return kvm_hv_eventfd_assign(kvm, args->conn_id, args->fd);
1760} 1770}
1771
1772int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
1773 struct kvm_cpuid_entry2 __user *entries)
1774{
1775 uint16_t evmcs_ver = kvm_x86_ops->nested_get_evmcs_version(vcpu);
1776 struct kvm_cpuid_entry2 cpuid_entries[] = {
1777 { .function = HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS },
1778 { .function = HYPERV_CPUID_INTERFACE },
1779 { .function = HYPERV_CPUID_VERSION },
1780 { .function = HYPERV_CPUID_FEATURES },
1781 { .function = HYPERV_CPUID_ENLIGHTMENT_INFO },
1782 { .function = HYPERV_CPUID_IMPLEMENT_LIMITS },
1783 { .function = HYPERV_CPUID_NESTED_FEATURES },
1784 };
1785 int i, nent = ARRAY_SIZE(cpuid_entries);
1786
1787 /* Skip NESTED_FEATURES if eVMCS is not supported */
1788 if (!evmcs_ver)
1789 --nent;
1790
1791 if (cpuid->nent < nent)
1792 return -E2BIG;
1793
1794 if (cpuid->nent > nent)
1795 cpuid->nent = nent;
1796
1797 for (i = 0; i < nent; i++) {
1798 struct kvm_cpuid_entry2 *ent = &cpuid_entries[i];
1799 u32 signature[3];
1800
1801 switch (ent->function) {
1802 case HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS:
1803 memcpy(signature, "Linux KVM Hv", 12);
1804
1805 ent->eax = HYPERV_CPUID_NESTED_FEATURES;
1806 ent->ebx = signature[0];
1807 ent->ecx = signature[1];
1808 ent->edx = signature[2];
1809 break;
1810
1811 case HYPERV_CPUID_INTERFACE:
1812 memcpy(signature, "Hv#1\0\0\0\0\0\0\0\0", 12);
1813 ent->eax = signature[0];
1814 break;
1815
1816 case HYPERV_CPUID_VERSION:
1817 /*
1818 * We implement some Hyper-V 2016 functions so let's use
1819 * this version.
1820 */
1821 ent->eax = 0x00003839;
1822 ent->ebx = 0x000A0000;
1823 break;
1824
1825 case HYPERV_CPUID_FEATURES:
1826 ent->eax |= HV_X64_MSR_VP_RUNTIME_AVAILABLE;
1827 ent->eax |= HV_MSR_TIME_REF_COUNT_AVAILABLE;
1828 ent->eax |= HV_X64_MSR_SYNIC_AVAILABLE;
1829 ent->eax |= HV_MSR_SYNTIMER_AVAILABLE;
1830 ent->eax |= HV_X64_MSR_APIC_ACCESS_AVAILABLE;
1831 ent->eax |= HV_X64_MSR_HYPERCALL_AVAILABLE;
1832 ent->eax |= HV_X64_MSR_VP_INDEX_AVAILABLE;
1833 ent->eax |= HV_X64_MSR_RESET_AVAILABLE;
1834 ent->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE;
1835 ent->eax |= HV_X64_MSR_GUEST_IDLE_AVAILABLE;
1836 ent->eax |= HV_X64_ACCESS_FREQUENCY_MSRS;
1837 ent->eax |= HV_X64_ACCESS_REENLIGHTENMENT;
1838
1839 ent->ebx |= HV_X64_POST_MESSAGES;
1840 ent->ebx |= HV_X64_SIGNAL_EVENTS;
1841
1842 ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE;
1843 ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
1844 ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE;
1845
1846 break;
1847
1848 case HYPERV_CPUID_ENLIGHTMENT_INFO:
1849 ent->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED;
1850 ent->eax |= HV_X64_APIC_ACCESS_RECOMMENDED;
1851 ent->eax |= HV_X64_SYSTEM_RESET_RECOMMENDED;
1852 ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED;
1853 ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED;
1854 ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED;
1855 ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
1856
1857 /*
1858 * Default number of spinlock retry attempts, matches
1859 * HyperV 2016.
1860 */
1861 ent->ebx = 0x00000FFF;
1862
1863 break;
1864
1865 case HYPERV_CPUID_IMPLEMENT_LIMITS:
1866 /* Maximum number of virtual processors */
1867 ent->eax = KVM_MAX_VCPUS;
1868 /*
1869 * Maximum number of logical processors, matches
1870 * HyperV 2016.
1871 */
1872 ent->ebx = 64;
1873
1874 break;
1875
1876 case HYPERV_CPUID_NESTED_FEATURES:
1877 ent->eax = evmcs_ver;
1878
1879 break;
1880
1881 default:
1882 break;
1883 }
1884 }
1885
1886 if (copy_to_user(entries, cpuid_entries,
1887 nent * sizeof(struct kvm_cpuid_entry2)))
1888 return -EFAULT;
1889
1890 return 0;
1891}
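kvm_vcpu_ioctl_get_hv_cpuid() above is the backend for a new vcpu ioctl; in this series it is exposed as KVM_GET_SUPPORTED_HV_CPUID taking a struct kvm_cpuid2 (documented in the api.txt portion of the diffstat). A rough userspace sketch of calling it, with the buffer sized by the caller as the -E2BIG check requires; hv_cpuid_buf and its 16-entry size are arbitrary choices for illustration:

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	struct hv_cpuid_buf {
		struct kvm_cpuid2 cpuid;
		struct kvm_cpuid_entry2 entries[16];
	};

	static int get_hv_cpuid(int vcpu_fd, struct hv_cpuid_buf *buf)
	{
		memset(buf, 0, sizeof(*buf));
		buf->cpuid.nent = 16;	/* must cover the 6 or 7 leaves returned */
		return ioctl(vcpu_fd, KVM_GET_SUPPORTED_HV_CPUID, &buf->cpuid);
	}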
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 0e66c12ed2c3..fd7cf13a2144 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -24,6 +24,8 @@
24#ifndef __ARCH_X86_KVM_HYPERV_H__ 24#ifndef __ARCH_X86_KVM_HYPERV_H__
25#define __ARCH_X86_KVM_HYPERV_H__ 25#define __ARCH_X86_KVM_HYPERV_H__
26 26
27#include <linux/kvm_host.h>
28
27static inline struct kvm_vcpu_hv *vcpu_to_hv_vcpu(struct kvm_vcpu *vcpu) 29static inline struct kvm_vcpu_hv *vcpu_to_hv_vcpu(struct kvm_vcpu *vcpu)
28{ 30{
29 return &vcpu->arch.hyperv; 31 return &vcpu->arch.hyperv;
@@ -95,5 +97,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
95void kvm_hv_init_vm(struct kvm *kvm); 97void kvm_hv_init_vm(struct kvm *kvm);
96void kvm_hv_destroy_vm(struct kvm *kvm); 98void kvm_hv_destroy_vm(struct kvm *kvm);
97int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args); 99int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
100int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
101 struct kvm_cpuid_entry2 __user *entries);
98 102
99#endif 103#endif
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 9619dcc2b325..f8f56a93358b 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -2,6 +2,8 @@
2#ifndef ASM_KVM_CACHE_REGS_H 2#ifndef ASM_KVM_CACHE_REGS_H
3#define ASM_KVM_CACHE_REGS_H 3#define ASM_KVM_CACHE_REGS_H
4 4
5#include <linux/kvm_host.h>
6
5#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS 7#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS
6#define KVM_POSSIBLE_CR4_GUEST_BITS \ 8#define KVM_POSSIBLE_CR4_GUEST_BITS \
7 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ 9 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index c4533d05c214..9f089e2e09d0 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -251,10 +251,9 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
251 251
252 if (enabled != apic->sw_enabled) { 252 if (enabled != apic->sw_enabled) {
253 apic->sw_enabled = enabled; 253 apic->sw_enabled = enabled;
254 if (enabled) { 254 if (enabled)
255 static_key_slow_dec_deferred(&apic_sw_disabled); 255 static_key_slow_dec_deferred(&apic_sw_disabled);
256 recalculate_apic_map(apic->vcpu->kvm); 256 else
257 } else
258 static_key_slow_inc(&apic_sw_disabled.key); 257 static_key_slow_inc(&apic_sw_disabled.key);
259 } 258 }
260} 259}
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7c03c0f35444..ce770b446238 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -264,6 +264,35 @@ static void mmu_spte_set(u64 *sptep, u64 spte);
264static union kvm_mmu_page_role 264static union kvm_mmu_page_role
265kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); 265kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
266 266
267
268static inline bool kvm_available_flush_tlb_with_range(void)
269{
270 return kvm_x86_ops->tlb_remote_flush_with_range;
271}
272
273static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
274 struct kvm_tlb_range *range)
275{
276 int ret = -ENOTSUPP;
277
278 if (range && kvm_x86_ops->tlb_remote_flush_with_range)
279 ret = kvm_x86_ops->tlb_remote_flush_with_range(kvm, range);
280
281 if (ret)
282 kvm_flush_remote_tlbs(kvm);
283}
284
285static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
286 u64 start_gfn, u64 pages)
287{
288 struct kvm_tlb_range range;
289
290 range.start_gfn = start_gfn;
291 range.pages = pages;
292
293 kvm_flush_remote_tlbs_with_range(kvm, &range);
294}
295
267void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) 296void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
268{ 297{
269 BUG_ON((mmio_mask & mmio_value) != mmio_value); 298 BUG_ON((mmio_mask & mmio_value) != mmio_value);
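The helpers added above define the fallback contract: a backend range-flush hook that cannot service the request returns non-zero, and kvm_flush_remote_tlbs_with_range() degrades to a full remote flush. A hypothetical backend honouring that contract — the example_* names and the page limit are invented for illustration; the real consumer is the Hyper-V enlightened flush wired up on the VMX side of this series:

	static int example_tlb_remote_flush_with_range(struct kvm *kvm,
						       struct kvm_tlb_range *range)
	{
		/* pretend the hypervisor interface only covers small ranges */
		if (range->pages > 2048)
			return -ENOTSUPP;	/* caller falls back to a full flush */

		return example_flush_gfn_range(kvm, range->start_gfn, range->pages);
	}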
@@ -1456,8 +1485,12 @@ static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
1456 1485
1457static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) 1486static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1458{ 1487{
1459 if (__drop_large_spte(vcpu->kvm, sptep)) 1488 if (__drop_large_spte(vcpu->kvm, sptep)) {
1460 kvm_flush_remote_tlbs(vcpu->kvm); 1489 struct kvm_mmu_page *sp = page_header(__pa(sptep));
1490
1491 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1492 KVM_PAGES_PER_HPAGE(sp->role.level));
1493 }
1461} 1494}
1462 1495
1463/* 1496/*
@@ -1743,10 +1776,12 @@ restart:
1743 } 1776 }
1744 } 1777 }
1745 1778
1746 if (need_flush) 1779 if (need_flush && kvm_available_flush_tlb_with_range()) {
1747 kvm_flush_remote_tlbs(kvm); 1780 kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1781 return 0;
1782 }
1748 1783
1749 return 0; 1784 return need_flush;
1750} 1785}
1751 1786
1752struct slot_rmap_walk_iterator { 1787struct slot_rmap_walk_iterator {
@@ -1880,9 +1915,9 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
1880 return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); 1915 return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
1881} 1916}
1882 1917
1883void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) 1918int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1884{ 1919{
1885 kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); 1920 return kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
1886} 1921}
1887 1922
1888static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, 1923static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
@@ -1925,7 +1960,8 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
1925 rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); 1960 rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
1926 1961
1927 kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0); 1962 kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
1928 kvm_flush_remote_tlbs(vcpu->kvm); 1963 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1964 KVM_PAGES_PER_HPAGE(sp->role.level));
1929} 1965}
1930 1966
1931int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) 1967int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
@@ -2441,7 +2477,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
2441 account_shadowed(vcpu->kvm, sp); 2477 account_shadowed(vcpu->kvm, sp);
2442 if (level == PT_PAGE_TABLE_LEVEL && 2478 if (level == PT_PAGE_TABLE_LEVEL &&
2443 rmap_write_protect(vcpu, gfn)) 2479 rmap_write_protect(vcpu, gfn))
2444 kvm_flush_remote_tlbs(vcpu->kvm); 2480 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
2445 2481
2446 if (level > PT_PAGE_TABLE_LEVEL && need_sync) 2482 if (level > PT_PAGE_TABLE_LEVEL && need_sync)
2447 flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); 2483 flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
@@ -2561,7 +2597,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2561 return; 2597 return;
2562 2598
2563 drop_parent_pte(child, sptep); 2599 drop_parent_pte(child, sptep);
2564 kvm_flush_remote_tlbs(vcpu->kvm); 2600 kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
2565 } 2601 }
2566} 2602}
2567 2603
@@ -2985,8 +3021,10 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
2985 ret = RET_PF_EMULATE; 3021 ret = RET_PF_EMULATE;
2986 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 3022 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2987 } 3023 }
3024
2988 if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush) 3025 if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
2989 kvm_flush_remote_tlbs(vcpu->kvm); 3026 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
3027 KVM_PAGES_PER_HPAGE(level));
2990 3028
2991 if (unlikely(is_mmio_spte(*sptep))) 3029 if (unlikely(is_mmio_spte(*sptep)))
2992 ret = RET_PF_EMULATE; 3030 ret = RET_PF_EMULATE;
@@ -5586,8 +5624,13 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
5586{ 5624{
5587 struct kvm_memslots *slots; 5625 struct kvm_memslots *slots;
5588 struct kvm_memory_slot *memslot; 5626 struct kvm_memory_slot *memslot;
5627 bool flush_tlb = true;
5628 bool flush = false;
5589 int i; 5629 int i;
5590 5630
5631 if (kvm_available_flush_tlb_with_range())
5632 flush_tlb = false;
5633
5591 spin_lock(&kvm->mmu_lock); 5634 spin_lock(&kvm->mmu_lock);
5592 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 5635 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
5593 slots = __kvm_memslots(kvm, i); 5636 slots = __kvm_memslots(kvm, i);
@@ -5599,12 +5642,17 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
5599 if (start >= end) 5642 if (start >= end)
5600 continue; 5643 continue;
5601 5644
5602 slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, 5645 flush |= slot_handle_level_range(kvm, memslot,
5603 PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL, 5646 kvm_zap_rmapp, PT_PAGE_TABLE_LEVEL,
5604 start, end - 1, true); 5647 PT_MAX_HUGEPAGE_LEVEL, start,
5648 end - 1, flush_tlb);
5605 } 5649 }
5606 } 5650 }
5607 5651
5652 if (flush)
5653 kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
5654 gfn_end - gfn_start + 1);
5655
5608 spin_unlock(&kvm->mmu_lock); 5656 spin_unlock(&kvm->mmu_lock);
5609} 5657}
5610 5658
@@ -5638,12 +5686,13 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
5638 * spte from present to present (changing the spte from present 5686 * spte from present to present (changing the spte from present
5639 * to nonpresent will flush all the TLBs immediately), in other 5687 * to nonpresent will flush all the TLBs immediately), in other
5640 * words, the only case we care is mmu_spte_update() where we 5688 * words, the only case we care is mmu_spte_update() where we
5641 * haved checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE 5689 * have checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE
5642 * instead of PT_WRITABLE_MASK, that means it does not depend 5690 * instead of PT_WRITABLE_MASK, that means it does not depend
5643 * on PT_WRITABLE_MASK anymore. 5691 * on PT_WRITABLE_MASK anymore.
5644 */ 5692 */
5645 if (flush) 5693 if (flush)
5646 kvm_flush_remote_tlbs(kvm); 5694 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
5695 memslot->npages);
5647} 5696}
5648 5697
5649static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, 5698static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
@@ -5671,7 +5720,13 @@ restart:
5671 !kvm_is_reserved_pfn(pfn) && 5720 !kvm_is_reserved_pfn(pfn) &&
5672 PageTransCompoundMap(pfn_to_page(pfn))) { 5721 PageTransCompoundMap(pfn_to_page(pfn))) {
5673 pte_list_remove(rmap_head, sptep); 5722 pte_list_remove(rmap_head, sptep);
5674 need_tlb_flush = 1; 5723
5724 if (kvm_available_flush_tlb_with_range())
5725 kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
5726 KVM_PAGES_PER_HPAGE(sp->role.level));
5727 else
5728 need_tlb_flush = 1;
5729
5675 goto restart; 5730 goto restart;
5676 } 5731 }
5677 } 5732 }
@@ -5707,7 +5762,8 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
5707 * dirty_bitmap. 5762 * dirty_bitmap.
5708 */ 5763 */
5709 if (flush) 5764 if (flush)
5710 kvm_flush_remote_tlbs(kvm); 5765 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
5766 memslot->npages);
5711} 5767}
5712EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty); 5768EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
5713 5769
@@ -5725,7 +5781,8 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
5725 lockdep_assert_held(&kvm->slots_lock); 5781 lockdep_assert_held(&kvm->slots_lock);
5726 5782
5727 if (flush) 5783 if (flush)
5728 kvm_flush_remote_tlbs(kvm); 5784 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
5785 memslot->npages);
5729} 5786}
5730EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access); 5787EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
5731 5788
@@ -5742,7 +5799,8 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
5742 5799
5743 /* see kvm_mmu_slot_leaf_clear_dirty */ 5800 /* see kvm_mmu_slot_leaf_clear_dirty */
5744 if (flush) 5801 if (flush)
5745 kvm_flush_remote_tlbs(kvm); 5802 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
5803 memslot->npages);
5746} 5804}
5747EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); 5805EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
5748 5806
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 7cf2185b7eb5..6bdca39829bc 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -894,7 +894,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
894 pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); 894 pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
895 895
896 if (mmu_page_zap_pte(vcpu->kvm, sp, sptep)) 896 if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
897 kvm_flush_remote_tlbs(vcpu->kvm); 897 kvm_flush_remote_tlbs_with_address(vcpu->kvm,
898 sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
898 899
899 if (!rmap_can_add(vcpu)) 900 if (!rmap_can_add(vcpu))
900 break; 901 break;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 101f53ccf571..307e5bddb6d9 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -675,11 +675,6 @@ struct svm_cpu_data {
675 675
676static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); 676static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
677 677
678struct svm_init_data {
679 int cpu;
680 int r;
681};
682
683static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; 678static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
684 679
685#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) 680#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
@@ -711,17 +706,17 @@ static u32 svm_msrpm_offset(u32 msr)
711 706
712static inline void clgi(void) 707static inline void clgi(void)
713{ 708{
714 asm volatile (__ex(SVM_CLGI)); 709 asm volatile (__ex("clgi"));
715} 710}
716 711
717static inline void stgi(void) 712static inline void stgi(void)
718{ 713{
719 asm volatile (__ex(SVM_STGI)); 714 asm volatile (__ex("stgi"));
720} 715}
721 716
722static inline void invlpga(unsigned long addr, u32 asid) 717static inline void invlpga(unsigned long addr, u32 asid)
723{ 718{
724 asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); 719 asm volatile (__ex("invlpga %1, %0") : : "c"(asid), "a"(addr));
725} 720}
726 721
727static int get_npt_level(struct kvm_vcpu *vcpu) 722static int get_npt_level(struct kvm_vcpu *vcpu)
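The SVM_* macros removed from svm.h at the top of this section spelled these instructions out as raw opcode bytes for the benefit of old assemblers; passing mnemonics to __ex() leaves the generated machine code unchanged. A small sketch of the equivalence for CLGI (0F 01 DD), matching the byte string deleted above:

	static inline void clgi_bytes(void)
	{
		asm volatile(".byte 0x0f, 0x01, 0xdd");	/* hand-encoded CLGI */
	}

	static inline void clgi_mnemonic(void)
	{
		asm volatile("clgi");			/* assembles to 0f 01 dd */
	}

The invlpga conversion additionally spells out the implicit operands in the mnemonic (address in rAX, ASID in ECX) instead of relying on the constraint list alone.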
@@ -1456,10 +1451,11 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1456 g_tsc_offset = svm->vmcb->control.tsc_offset - 1451 g_tsc_offset = svm->vmcb->control.tsc_offset -
1457 svm->nested.hsave->control.tsc_offset; 1452 svm->nested.hsave->control.tsc_offset;
1458 svm->nested.hsave->control.tsc_offset = offset; 1453 svm->nested.hsave->control.tsc_offset = offset;
1459 } else 1454 }
1460 trace_kvm_write_tsc_offset(vcpu->vcpu_id, 1455
1461 svm->vmcb->control.tsc_offset, 1456 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
1462 offset); 1457 svm->vmcb->control.tsc_offset - g_tsc_offset,
1458 offset);
1463 1459
1464 svm->vmcb->control.tsc_offset = offset + g_tsc_offset; 1460 svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
1465 1461
@@ -2129,6 +2125,13 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
2129 goto out; 2125 goto out;
2130 } 2126 }
2131 2127
2128 svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL);
2129 if (!svm->vcpu.arch.guest_fpu) {
2130 printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
2131 err = -ENOMEM;
2132 goto free_partial_svm;
2133 }
2134
2132 err = kvm_vcpu_init(&svm->vcpu, kvm, id); 2135 err = kvm_vcpu_init(&svm->vcpu, kvm, id);
2133 if (err) 2136 if (err)
2134 goto free_svm; 2137 goto free_svm;
@@ -2188,6 +2191,8 @@ free_page1:
2188uninit: 2191uninit:
2189 kvm_vcpu_uninit(&svm->vcpu); 2192 kvm_vcpu_uninit(&svm->vcpu);
2190free_svm: 2193free_svm:
2194 kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu);
2195free_partial_svm:
2191 kmem_cache_free(kvm_vcpu_cache, svm); 2196 kmem_cache_free(kvm_vcpu_cache, svm);
2192out: 2197out:
2193 return ERR_PTR(err); 2198 return ERR_PTR(err);
@@ -2217,6 +2222,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
2217 __free_page(virt_to_page(svm->nested.hsave)); 2222 __free_page(virt_to_page(svm->nested.hsave));
2218 __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); 2223 __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
2219 kvm_vcpu_uninit(vcpu); 2224 kvm_vcpu_uninit(vcpu);
2225 kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu);
2220 kmem_cache_free(kvm_vcpu_cache, svm); 2226 kmem_cache_free(kvm_vcpu_cache, svm);
2221} 2227}
2222 2228
@@ -3278,6 +3284,8 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr
3278 dst->event_inj_err = from->event_inj_err; 3284 dst->event_inj_err = from->event_inj_err;
3279 dst->nested_cr3 = from->nested_cr3; 3285 dst->nested_cr3 = from->nested_cr3;
3280 dst->virt_ext = from->virt_ext; 3286 dst->virt_ext = from->virt_ext;
3287 dst->pause_filter_count = from->pause_filter_count;
3288 dst->pause_filter_thresh = from->pause_filter_thresh;
3281} 3289}
3282 3290
3283static int nested_svm_vmexit(struct vcpu_svm *svm) 3291static int nested_svm_vmexit(struct vcpu_svm *svm)
@@ -3356,6 +3364,11 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
3356 nested_vmcb->control.event_inj = 0; 3364 nested_vmcb->control.event_inj = 0;
3357 nested_vmcb->control.event_inj_err = 0; 3365 nested_vmcb->control.event_inj_err = 0;
3358 3366
3367 nested_vmcb->control.pause_filter_count =
3368 svm->vmcb->control.pause_filter_count;
3369 nested_vmcb->control.pause_filter_thresh =
3370 svm->vmcb->control.pause_filter_thresh;
3371
3359 /* We always set V_INTR_MASKING and remember the old value in hflags */ 3372 /* We always set V_INTR_MASKING and remember the old value in hflags */
3360 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) 3373 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
3361 nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; 3374 nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
@@ -3532,6 +3545,11 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
3532 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; 3545 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
3533 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; 3546 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
3534 3547
3548 svm->vmcb->control.pause_filter_count =
3549 nested_vmcb->control.pause_filter_count;
3550 svm->vmcb->control.pause_filter_thresh =
3551 nested_vmcb->control.pause_filter_thresh;
3552
3535 nested_svm_unmap(page); 3553 nested_svm_unmap(page);
3536 3554
3537 /* Enter Guest-Mode */ 3555 /* Enter Guest-Mode */
@@ -5636,9 +5654,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
5636 /* Enter guest mode */ 5654 /* Enter guest mode */
5637 "push %%" _ASM_AX " \n\t" 5655 "push %%" _ASM_AX " \n\t"
5638 "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t" 5656 "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t"
5639 __ex(SVM_VMLOAD) "\n\t" 5657 __ex("vmload %%" _ASM_AX) "\n\t"
5640 __ex(SVM_VMRUN) "\n\t" 5658 __ex("vmrun %%" _ASM_AX) "\n\t"
5641 __ex(SVM_VMSAVE) "\n\t" 5659 __ex("vmsave %%" _ASM_AX) "\n\t"
5642 "pop %%" _ASM_AX " \n\t" 5660 "pop %%" _ASM_AX " \n\t"
5643 5661
5644 /* Save guest registers, load host registers */ 5662 /* Save guest registers, load host registers */
@@ -5836,6 +5854,13 @@ static bool svm_cpu_has_accelerated_tpr(void)
5836 5854
5837static bool svm_has_emulated_msr(int index) 5855static bool svm_has_emulated_msr(int index)
5838{ 5856{
5857 switch (index) {
5858 case MSR_IA32_MCG_EXT_CTL:
5859 return false;
5860 default:
5861 break;
5862 }
5863
5839 return true; 5864 return true;
5840} 5865}
5841 5866
@@ -5924,6 +5949,11 @@ static bool svm_umip_emulated(void)
5924 return false; 5949 return false;
5925} 5950}
5926 5951
5952static bool svm_pt_supported(void)
5953{
5954 return false;
5955}
5956
5927static bool svm_has_wbinvd_exit(void) 5957static bool svm_has_wbinvd_exit(void)
5928{ 5958{
5929 return true; 5959 return true;
@@ -7053,6 +7083,12 @@ failed:
7053 return ret; 7083 return ret;
7054} 7084}
7055 7085
7086static uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu)
7087{
7088 /* Not supported */
7089 return 0;
7090}
7091
7056static int nested_enable_evmcs(struct kvm_vcpu *vcpu, 7092static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
7057 uint16_t *vmcs_version) 7093 uint16_t *vmcs_version)
7058{ 7094{
@@ -7159,6 +7195,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
7159 .mpx_supported = svm_mpx_supported, 7195 .mpx_supported = svm_mpx_supported,
7160 .xsaves_supported = svm_xsaves_supported, 7196 .xsaves_supported = svm_xsaves_supported,
7161 .umip_emulated = svm_umip_emulated, 7197 .umip_emulated = svm_umip_emulated,
7198 .pt_supported = svm_pt_supported,
7162 7199
7163 .set_supported_cpuid = svm_set_supported_cpuid, 7200 .set_supported_cpuid = svm_set_supported_cpuid,
7164 7201
@@ -7191,6 +7228,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
7191 .mem_enc_unreg_region = svm_unregister_enc_region, 7228 .mem_enc_unreg_region = svm_unregister_enc_region,
7192 7229
7193 .nested_enable_evmcs = nested_enable_evmcs, 7230 .nested_enable_evmcs = nested_enable_evmcs,
7231 .nested_get_evmcs_version = nested_get_evmcs_version,
7194}; 7232};
7195 7233
7196static int __init svm_init(void) 7234static int __init svm_init(void)
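The two callbacks added to the ops table follow the usual kvm_x86_ops pattern: the vendor module always supplies an implementation, even a trivial one, so common code (the f_intel_pt computation in cpuid.c and the eVMCS version query in hyperv.c above) can call the hook without NULL checks. A minimal sketch of that pattern with invented noop_* names:

	static bool noop_pt_supported(void)
	{
		return false;			/* no Intel PT virtualization */
	}

	static uint16_t noop_nested_get_evmcs_version(struct kvm_vcpu *vcpu)
	{
		return 0;			/* 0: enlightened VMCS not offered */
	}

	/* wired into the vendor ops table, e.g.:
	 *	.pt_supported             = noop_pt_supported,
	 *	.nested_get_evmcs_version = noop_nested_get_evmcs_version,
	 */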
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 0659465a745c..705f40ae2532 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1254,24 +1254,26 @@ TRACE_EVENT(kvm_hv_stimer_callback,
1254 * Tracepoint for stimer_expiration. 1254 * Tracepoint for stimer_expiration.
1255 */ 1255 */
1256TRACE_EVENT(kvm_hv_stimer_expiration, 1256TRACE_EVENT(kvm_hv_stimer_expiration,
1257 TP_PROTO(int vcpu_id, int timer_index, int msg_send_result), 1257 TP_PROTO(int vcpu_id, int timer_index, int direct, int msg_send_result),
1258 TP_ARGS(vcpu_id, timer_index, msg_send_result), 1258 TP_ARGS(vcpu_id, timer_index, direct, msg_send_result),
1259 1259
1260 TP_STRUCT__entry( 1260 TP_STRUCT__entry(
1261 __field(int, vcpu_id) 1261 __field(int, vcpu_id)
1262 __field(int, timer_index) 1262 __field(int, timer_index)
1263 __field(int, direct)
1263 __field(int, msg_send_result) 1264 __field(int, msg_send_result)
1264 ), 1265 ),
1265 1266
1266 TP_fast_assign( 1267 TP_fast_assign(
1267 __entry->vcpu_id = vcpu_id; 1268 __entry->vcpu_id = vcpu_id;
1268 __entry->timer_index = timer_index; 1269 __entry->timer_index = timer_index;
1270 __entry->direct = direct;
1269 __entry->msg_send_result = msg_send_result; 1271 __entry->msg_send_result = msg_send_result;
1270 ), 1272 ),
1271 1273
1272 TP_printk("vcpu_id %d timer %d msg send result %d", 1274 TP_printk("vcpu_id %d timer %d direct %d send result %d",
1273 __entry->vcpu_id, __entry->timer_index, 1275 __entry->vcpu_id, __entry->timer_index,
1274 __entry->msg_send_result) 1276 __entry->direct, __entry->msg_send_result)
1275); 1277);
1276 1278
1277/* 1279/*
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
deleted file mode 100644
index 8d5d984541be..000000000000
--- a/arch/x86/kvm/vmx.c
+++ /dev/null
@@ -1,15252 +0,0 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * Copyright (C) 2006 Qumranet, Inc.
8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9 *
10 * Authors:
11 * Avi Kivity <avi@qumranet.com>
12 * Yaniv Kamay <yaniv@qumranet.com>
13 *
14 * This work is licensed under the terms of the GNU GPL, version 2. See
15 * the COPYING file in the top-level directory.
16 *
17 */
18
19#include "irq.h"
20#include "mmu.h"
21#include "cpuid.h"
22#include "lapic.h"
23#include "hyperv.h"
24
25#include <linux/kvm_host.h>
26#include <linux/module.h>
27#include <linux/kernel.h>
28#include <linux/mm.h>
29#include <linux/highmem.h>
30#include <linux/sched.h>
31#include <linux/moduleparam.h>
32#include <linux/mod_devicetable.h>
33#include <linux/trace_events.h>
34#include <linux/slab.h>
35#include <linux/tboot.h>
36#include <linux/hrtimer.h>
37#include <linux/frame.h>
38#include <linux/nospec.h>
39#include "kvm_cache_regs.h"
40#include "x86.h"
41
42#include <asm/asm.h>
43#include <asm/cpu.h>
44#include <asm/io.h>
45#include <asm/desc.h>
46#include <asm/vmx.h>
47#include <asm/virtext.h>
48#include <asm/mce.h>
49#include <asm/fpu/internal.h>
50#include <asm/perf_event.h>
51#include <asm/debugreg.h>
52#include <asm/kexec.h>
53#include <asm/apic.h>
54#include <asm/irq_remapping.h>
55#include <asm/mmu_context.h>
56#include <asm/spec-ctrl.h>
57#include <asm/mshyperv.h>
58
59#include "trace.h"
60#include "pmu.h"
61#include "vmx_evmcs.h"
62
63#define __ex(x) __kvm_handle_fault_on_reboot(x)
64#define __ex_clear(x, reg) \
65 ____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg)
66
67MODULE_AUTHOR("Qumranet");
68MODULE_LICENSE("GPL");
69
70static const struct x86_cpu_id vmx_cpu_id[] = {
71 X86_FEATURE_MATCH(X86_FEATURE_VMX),
72 {}
73};
74MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
75
76static bool __read_mostly enable_vpid = 1;
77module_param_named(vpid, enable_vpid, bool, 0444);
78
79static bool __read_mostly enable_vnmi = 1;
80module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
81
82static bool __read_mostly flexpriority_enabled = 1;
83module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
84
85static bool __read_mostly enable_ept = 1;
86module_param_named(ept, enable_ept, bool, S_IRUGO);
87
88static bool __read_mostly enable_unrestricted_guest = 1;
89module_param_named(unrestricted_guest,
90 enable_unrestricted_guest, bool, S_IRUGO);
91
92static bool __read_mostly enable_ept_ad_bits = 1;
93module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
94
95static bool __read_mostly emulate_invalid_guest_state = true;
96module_param(emulate_invalid_guest_state, bool, S_IRUGO);
97
98static bool __read_mostly fasteoi = 1;
99module_param(fasteoi, bool, S_IRUGO);
100
101static bool __read_mostly enable_apicv = 1;
102module_param(enable_apicv, bool, S_IRUGO);
103
104static bool __read_mostly enable_shadow_vmcs = 1;
105module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
106/*
107 * If nested=1, nested virtualization is supported, i.e., guests may use
108 * VMX and be a hypervisor for its own guests. If nested=0, guests may not
109 * use VMX instructions.
110 */
111static bool __read_mostly nested = 1;
112module_param(nested, bool, S_IRUGO);
113
114static bool __read_mostly nested_early_check = 0;
115module_param(nested_early_check, bool, S_IRUGO);
116
117static u64 __read_mostly host_xss;
118
119static bool __read_mostly enable_pml = 1;
120module_param_named(pml, enable_pml, bool, S_IRUGO);
121
122#define MSR_TYPE_R 1
123#define MSR_TYPE_W 2
124#define MSR_TYPE_RW 3
125
126#define MSR_BITMAP_MODE_X2APIC 1
127#define MSR_BITMAP_MODE_X2APIC_APICV 2
128
129#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
130
131/* Guest_tsc -> host_tsc conversion requires 64-bit division. */
132static int __read_mostly cpu_preemption_timer_multi;
133static bool __read_mostly enable_preemption_timer = 1;
134#ifdef CONFIG_X86_64
135module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
136#endif
137
138#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
139#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
140#define KVM_VM_CR0_ALWAYS_ON \
141 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \
142 X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
143#define KVM_CR4_GUEST_OWNED_BITS \
144 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
145 | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)
146
147#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
148#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
149#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
150
151#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
152
153#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
154
155/*
156 * Hyper-V requires all of these, so mark them as supported even though
157 * they are just treated the same as all-context.
158 */
159#define VMX_VPID_EXTENT_SUPPORTED_MASK \
160 (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \
161 VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \
162 VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \
163 VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
164
165/*
166 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
167 * ple_gap: upper bound on the amount of time between two successive
168 * executions of PAUSE in a loop. Also indicate if ple enabled.
169 * According to test, this time is usually smaller than 128 cycles.
170 * ple_window: upper bound on the amount of time a guest is allowed to execute
171 * in a PAUSE loop. Tests indicate that most spinlocks are held for
172 * less than 2^12 cycles
173 * Time is measured based on a counter that runs at the same rate as the TSC,
174 * refer SDM volume 3b section 21.6.13 & 22.1.3.
175 */
176static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
177module_param(ple_gap, uint, 0444);
178
179static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
180module_param(ple_window, uint, 0444);
181
182/* Default doubles per-vcpu window every exit. */
183static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
184module_param(ple_window_grow, uint, 0444);
185
186/* Default resets per-vcpu window every exit to ple_window. */
187static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
188module_param(ple_window_shrink, uint, 0444);
189
190/* Default is to compute the maximum so we can never overflow. */
191static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
192module_param(ple_window_max, uint, 0444);
193
194extern const ulong vmx_return;
195extern const ulong vmx_early_consistency_check_return;
196
197static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
198static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
199static DEFINE_MUTEX(vmx_l1d_flush_mutex);
200
201/* Storage for pre module init parameter parsing */
202static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
203
204static const struct {
205 const char *option;
206 bool for_parse;
207} vmentry_l1d_param[] = {
208 [VMENTER_L1D_FLUSH_AUTO] = {"auto", true},
209 [VMENTER_L1D_FLUSH_NEVER] = {"never", true},
210 [VMENTER_L1D_FLUSH_COND] = {"cond", true},
211 [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true},
212 [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
213 [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
214};
215
216#define L1D_CACHE_ORDER 4
217static void *vmx_l1d_flush_pages;
218
219static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
220{
221 struct page *page;
222 unsigned int i;
223
224 if (!enable_ept) {
225 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
226 return 0;
227 }
228
229 if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
230 u64 msr;
231
232 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
233 if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
234 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
235 return 0;
236 }
237 }
238
239 /* If set to auto use the default l1tf mitigation method */
240 if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
241 switch (l1tf_mitigation) {
242 case L1TF_MITIGATION_OFF:
243 l1tf = VMENTER_L1D_FLUSH_NEVER;
244 break;
245 case L1TF_MITIGATION_FLUSH_NOWARN:
246 case L1TF_MITIGATION_FLUSH:
247 case L1TF_MITIGATION_FLUSH_NOSMT:
248 l1tf = VMENTER_L1D_FLUSH_COND;
249 break;
250 case L1TF_MITIGATION_FULL:
251 case L1TF_MITIGATION_FULL_FORCE:
252 l1tf = VMENTER_L1D_FLUSH_ALWAYS;
253 break;
254 }
255 } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
256 l1tf = VMENTER_L1D_FLUSH_ALWAYS;
257 }
258
259 if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
260 !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
261 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
262 if (!page)
263 return -ENOMEM;
264 vmx_l1d_flush_pages = page_address(page);
265
266 /*
267 * Initialize each page with a different pattern in
268 * order to protect against KSM in the nested
269 * virtualization case.
270 */
271 for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
272 memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
273 PAGE_SIZE);
274 }
275 }
276
277 l1tf_vmx_mitigation = l1tf;
278
279 if (l1tf != VMENTER_L1D_FLUSH_NEVER)
280 static_branch_enable(&vmx_l1d_should_flush);
281 else
282 static_branch_disable(&vmx_l1d_should_flush);
283
284 if (l1tf == VMENTER_L1D_FLUSH_COND)
285 static_branch_enable(&vmx_l1d_flush_cond);
286 else
287 static_branch_disable(&vmx_l1d_flush_cond);
288 return 0;
289}
290
291static int vmentry_l1d_flush_parse(const char *s)
292{
293 unsigned int i;
294
295 if (s) {
296 for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
297 if (vmentry_l1d_param[i].for_parse &&
298 sysfs_streq(s, vmentry_l1d_param[i].option))
299 return i;
300 }
301 }
302 return -EINVAL;
303}
304
305static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
306{
307 int l1tf, ret;
308
309 l1tf = vmentry_l1d_flush_parse(s);
310 if (l1tf < 0)
311 return l1tf;
312
313 if (!boot_cpu_has(X86_BUG_L1TF))
314 return 0;
315
316 /*
317 * Has vmx_init() run already? If not then this is the pre init
318 * parameter parsing. In that case just store the value and let
319 * vmx_init() do the proper setup after enable_ept has been
320 * established.
321 */
322 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
323 vmentry_l1d_flush_param = l1tf;
324 return 0;
325 }
326
327 mutex_lock(&vmx_l1d_flush_mutex);
328 ret = vmx_setup_l1d_flush(l1tf);
329 mutex_unlock(&vmx_l1d_flush_mutex);
330 return ret;
331}
332
333static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
334{
335 if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
336 return sprintf(s, "???\n");
337
338 return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
339}
340
341static const struct kernel_param_ops vmentry_l1d_flush_ops = {
342 .set = vmentry_l1d_flush_set,
343 .get = vmentry_l1d_flush_get,
344};
345module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
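/*
 * Usage note: with the module built as kvm_intel, this parameter is
 * typically set on the kernel command line as
 * "kvm-intel.vmentry_l1d_flush={never,cond,always}" or at runtime through
 * /sys/module/kvm_intel/parameters/vmentry_l1d_flush; reads go through
 * vmentry_l1d_flush_get() above and report the effective mitigation state.
 */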
346
347enum ept_pointers_status {
348 EPT_POINTERS_CHECK = 0,
349 EPT_POINTERS_MATCH = 1,
350 EPT_POINTERS_MISMATCH = 2
351};
352
353struct kvm_vmx {
354 struct kvm kvm;
355
356 unsigned int tss_addr;
357 bool ept_identity_pagetable_done;
358 gpa_t ept_identity_map_addr;
359
360 enum ept_pointers_status ept_pointers_match;
361 spinlock_t ept_pointer_lock;
362};
363
364#define NR_AUTOLOAD_MSRS 8
365
366struct vmcs_hdr {
367 u32 revision_id:31;
368 u32 shadow_vmcs:1;
369};
370
371struct vmcs {
372 struct vmcs_hdr hdr;
373 u32 abort;
374 char data[0];
375};
376
377/*
378 * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT
379 * and whose values change infrequently, but are not constant. I.e. this is
380 * used as a write-through cache of the corresponding VMCS fields.
381 */
382struct vmcs_host_state {
383 unsigned long cr3; /* May not match real cr3 */
384 unsigned long cr4; /* May not match real cr4 */
385 unsigned long gs_base;
386 unsigned long fs_base;
387
388 u16 fs_sel, gs_sel, ldt_sel;
389#ifdef CONFIG_X86_64
390 u16 ds_sel, es_sel;
391#endif
392};
393
394/*
395 * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
396 * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
397 * loaded on this CPU (so we can clear them if the CPU goes down).
398 */
399struct loaded_vmcs {
400 struct vmcs *vmcs;
401 struct vmcs *shadow_vmcs;
402 int cpu;
403 bool launched;
404 bool nmi_known_unmasked;
405 bool hv_timer_armed;
406 /* Support for vnmi-less CPUs */
407 int soft_vnmi_blocked;
408 ktime_t entry_time;
409 s64 vnmi_blocked_time;
410 unsigned long *msr_bitmap;
411 struct list_head loaded_vmcss_on_cpu_link;
412 struct vmcs_host_state host_state;
413};
414
415struct shared_msr_entry {
416 unsigned index;
417 u64 data;
418 u64 mask;
419};
420
421/*
422 * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
423 * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
424 * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
425 * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
426 * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
427 * More than one of these structures may exist, if L1 runs multiple L2 guests.
428 * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
429 * underlying hardware which will be used to run L2.
430 * This structure is packed to ensure that its layout is identical across
431 * machines (necessary for live migration).
432 *
433 * IMPORTANT: Changing the layout of existing fields in this structure
434 * will break save/restore compatibility with older kvm releases. When
435 * adding new fields, either use space in the reserved padding* arrays
436 * or add the new fields to the end of the structure.
437 */
438typedef u64 natural_width;
439struct __packed vmcs12 {
440 /* According to the Intel spec, a VMCS region must start with the
441 * following two fields. Then follow implementation-specific data.
442 */
443 struct vmcs_hdr hdr;
444 u32 abort;
445
446 u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
447 u32 padding[7]; /* room for future expansion */
448
449 u64 io_bitmap_a;
450 u64 io_bitmap_b;
451 u64 msr_bitmap;
452 u64 vm_exit_msr_store_addr;
453 u64 vm_exit_msr_load_addr;
454 u64 vm_entry_msr_load_addr;
455 u64 tsc_offset;
456 u64 virtual_apic_page_addr;
457 u64 apic_access_addr;
458 u64 posted_intr_desc_addr;
459 u64 ept_pointer;
460 u64 eoi_exit_bitmap0;
461 u64 eoi_exit_bitmap1;
462 u64 eoi_exit_bitmap2;
463 u64 eoi_exit_bitmap3;
464 u64 xss_exit_bitmap;
465 u64 guest_physical_address;
466 u64 vmcs_link_pointer;
467 u64 guest_ia32_debugctl;
468 u64 guest_ia32_pat;
469 u64 guest_ia32_efer;
470 u64 guest_ia32_perf_global_ctrl;
471 u64 guest_pdptr0;
472 u64 guest_pdptr1;
473 u64 guest_pdptr2;
474 u64 guest_pdptr3;
475 u64 guest_bndcfgs;
476 u64 host_ia32_pat;
477 u64 host_ia32_efer;
478 u64 host_ia32_perf_global_ctrl;
479 u64 vmread_bitmap;
480 u64 vmwrite_bitmap;
481 u64 vm_function_control;
482 u64 eptp_list_address;
483 u64 pml_address;
484 u64 padding64[3]; /* room for future expansion */
485 /*
486 * To allow migration of L1 (complete with its L2 guests) between
487 * machines of different natural widths (32 or 64 bit), we cannot have
488 * unsigned long fields with no explicit size. We use u64 (aliased
489 * natural_width) instead. Luckily, x86 is little-endian.
490 */
491 natural_width cr0_guest_host_mask;
492 natural_width cr4_guest_host_mask;
493 natural_width cr0_read_shadow;
494 natural_width cr4_read_shadow;
495 natural_width cr3_target_value0;
496 natural_width cr3_target_value1;
497 natural_width cr3_target_value2;
498 natural_width cr3_target_value3;
499 natural_width exit_qualification;
500 natural_width guest_linear_address;
501 natural_width guest_cr0;
502 natural_width guest_cr3;
503 natural_width guest_cr4;
504 natural_width guest_es_base;
505 natural_width guest_cs_base;
506 natural_width guest_ss_base;
507 natural_width guest_ds_base;
508 natural_width guest_fs_base;
509 natural_width guest_gs_base;
510 natural_width guest_ldtr_base;
511 natural_width guest_tr_base;
512 natural_width guest_gdtr_base;
513 natural_width guest_idtr_base;
514 natural_width guest_dr7;
515 natural_width guest_rsp;
516 natural_width guest_rip;
517 natural_width guest_rflags;
518 natural_width guest_pending_dbg_exceptions;
519 natural_width guest_sysenter_esp;
520 natural_width guest_sysenter_eip;
521 natural_width host_cr0;
522 natural_width host_cr3;
523 natural_width host_cr4;
524 natural_width host_fs_base;
525 natural_width host_gs_base;
526 natural_width host_tr_base;
527 natural_width host_gdtr_base;
528 natural_width host_idtr_base;
529 natural_width host_ia32_sysenter_esp;
530 natural_width host_ia32_sysenter_eip;
531 natural_width host_rsp;
532 natural_width host_rip;
533 natural_width paddingl[8]; /* room for future expansion */
534 u32 pin_based_vm_exec_control;
535 u32 cpu_based_vm_exec_control;
536 u32 exception_bitmap;
537 u32 page_fault_error_code_mask;
538 u32 page_fault_error_code_match;
539 u32 cr3_target_count;
540 u32 vm_exit_controls;
541 u32 vm_exit_msr_store_count;
542 u32 vm_exit_msr_load_count;
543 u32 vm_entry_controls;
544 u32 vm_entry_msr_load_count;
545 u32 vm_entry_intr_info_field;
546 u32 vm_entry_exception_error_code;
547 u32 vm_entry_instruction_len;
548 u32 tpr_threshold;
549 u32 secondary_vm_exec_control;
550 u32 vm_instruction_error;
551 u32 vm_exit_reason;
552 u32 vm_exit_intr_info;
553 u32 vm_exit_intr_error_code;
554 u32 idt_vectoring_info_field;
555 u32 idt_vectoring_error_code;
556 u32 vm_exit_instruction_len;
557 u32 vmx_instruction_info;
558 u32 guest_es_limit;
559 u32 guest_cs_limit;
560 u32 guest_ss_limit;
561 u32 guest_ds_limit;
562 u32 guest_fs_limit;
563 u32 guest_gs_limit;
564 u32 guest_ldtr_limit;
565 u32 guest_tr_limit;
566 u32 guest_gdtr_limit;
567 u32 guest_idtr_limit;
568 u32 guest_es_ar_bytes;
569 u32 guest_cs_ar_bytes;
570 u32 guest_ss_ar_bytes;
571 u32 guest_ds_ar_bytes;
572 u32 guest_fs_ar_bytes;
573 u32 guest_gs_ar_bytes;
574 u32 guest_ldtr_ar_bytes;
575 u32 guest_tr_ar_bytes;
576 u32 guest_interruptibility_info;
577 u32 guest_activity_state;
578 u32 guest_sysenter_cs;
579 u32 host_ia32_sysenter_cs;
580 u32 vmx_preemption_timer_value;
581 u32 padding32[7]; /* room for future expansion */
582 u16 virtual_processor_id;
583 u16 posted_intr_nv;
584 u16 guest_es_selector;
585 u16 guest_cs_selector;
586 u16 guest_ss_selector;
587 u16 guest_ds_selector;
588 u16 guest_fs_selector;
589 u16 guest_gs_selector;
590 u16 guest_ldtr_selector;
591 u16 guest_tr_selector;
592 u16 guest_intr_status;
593 u16 host_es_selector;
594 u16 host_cs_selector;
595 u16 host_ss_selector;
596 u16 host_ds_selector;
597 u16 host_fs_selector;
598 u16 host_gs_selector;
599 u16 host_tr_selector;
600 u16 guest_pml_index;
601};
602
603/*
604 * For save/restore compatibility, the vmcs12 field offsets must not change.
605 */
606#define CHECK_OFFSET(field, loc) \
607 BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc), \
608 "Offset of " #field " in struct vmcs12 has changed.")
609
610static inline void vmx_check_vmcs12_offsets(void) {
611 CHECK_OFFSET(hdr, 0);
612 CHECK_OFFSET(abort, 4);
613 CHECK_OFFSET(launch_state, 8);
614 CHECK_OFFSET(io_bitmap_a, 40);
615 CHECK_OFFSET(io_bitmap_b, 48);
616 CHECK_OFFSET(msr_bitmap, 56);
617 CHECK_OFFSET(vm_exit_msr_store_addr, 64);
618 CHECK_OFFSET(vm_exit_msr_load_addr, 72);
619 CHECK_OFFSET(vm_entry_msr_load_addr, 80);
620 CHECK_OFFSET(tsc_offset, 88);
621 CHECK_OFFSET(virtual_apic_page_addr, 96);
622 CHECK_OFFSET(apic_access_addr, 104);
623 CHECK_OFFSET(posted_intr_desc_addr, 112);
624 CHECK_OFFSET(ept_pointer, 120);
625 CHECK_OFFSET(eoi_exit_bitmap0, 128);
626 CHECK_OFFSET(eoi_exit_bitmap1, 136);
627 CHECK_OFFSET(eoi_exit_bitmap2, 144);
628 CHECK_OFFSET(eoi_exit_bitmap3, 152);
629 CHECK_OFFSET(xss_exit_bitmap, 160);
630 CHECK_OFFSET(guest_physical_address, 168);
631 CHECK_OFFSET(vmcs_link_pointer, 176);
632 CHECK_OFFSET(guest_ia32_debugctl, 184);
633 CHECK_OFFSET(guest_ia32_pat, 192);
634 CHECK_OFFSET(guest_ia32_efer, 200);
635 CHECK_OFFSET(guest_ia32_perf_global_ctrl, 208);
636 CHECK_OFFSET(guest_pdptr0, 216);
637 CHECK_OFFSET(guest_pdptr1, 224);
638 CHECK_OFFSET(guest_pdptr2, 232);
639 CHECK_OFFSET(guest_pdptr3, 240);
640 CHECK_OFFSET(guest_bndcfgs, 248);
641 CHECK_OFFSET(host_ia32_pat, 256);
642 CHECK_OFFSET(host_ia32_efer, 264);
643 CHECK_OFFSET(host_ia32_perf_global_ctrl, 272);
644 CHECK_OFFSET(vmread_bitmap, 280);
645 CHECK_OFFSET(vmwrite_bitmap, 288);
646 CHECK_OFFSET(vm_function_control, 296);
647 CHECK_OFFSET(eptp_list_address, 304);
648 CHECK_OFFSET(pml_address, 312);
649 CHECK_OFFSET(cr0_guest_host_mask, 344);
650 CHECK_OFFSET(cr4_guest_host_mask, 352);
651 CHECK_OFFSET(cr0_read_shadow, 360);
652 CHECK_OFFSET(cr4_read_shadow, 368);
653 CHECK_OFFSET(cr3_target_value0, 376);
654 CHECK_OFFSET(cr3_target_value1, 384);
655 CHECK_OFFSET(cr3_target_value2, 392);
656 CHECK_OFFSET(cr3_target_value3, 400);
657 CHECK_OFFSET(exit_qualification, 408);
658 CHECK_OFFSET(guest_linear_address, 416);
659 CHECK_OFFSET(guest_cr0, 424);
660 CHECK_OFFSET(guest_cr3, 432);
661 CHECK_OFFSET(guest_cr4, 440);
662 CHECK_OFFSET(guest_es_base, 448);
663 CHECK_OFFSET(guest_cs_base, 456);
664 CHECK_OFFSET(guest_ss_base, 464);
665 CHECK_OFFSET(guest_ds_base, 472);
666 CHECK_OFFSET(guest_fs_base, 480);
667 CHECK_OFFSET(guest_gs_base, 488);
668 CHECK_OFFSET(guest_ldtr_base, 496);
669 CHECK_OFFSET(guest_tr_base, 504);
670 CHECK_OFFSET(guest_gdtr_base, 512);
671 CHECK_OFFSET(guest_idtr_base, 520);
672 CHECK_OFFSET(guest_dr7, 528);
673 CHECK_OFFSET(guest_rsp, 536);
674 CHECK_OFFSET(guest_rip, 544);
675 CHECK_OFFSET(guest_rflags, 552);
676 CHECK_OFFSET(guest_pending_dbg_exceptions, 560);
677 CHECK_OFFSET(guest_sysenter_esp, 568);
678 CHECK_OFFSET(guest_sysenter_eip, 576);
679 CHECK_OFFSET(host_cr0, 584);
680 CHECK_OFFSET(host_cr3, 592);
681 CHECK_OFFSET(host_cr4, 600);
682 CHECK_OFFSET(host_fs_base, 608);
683 CHECK_OFFSET(host_gs_base, 616);
684 CHECK_OFFSET(host_tr_base, 624);
685 CHECK_OFFSET(host_gdtr_base, 632);
686 CHECK_OFFSET(host_idtr_base, 640);
687 CHECK_OFFSET(host_ia32_sysenter_esp, 648);
688 CHECK_OFFSET(host_ia32_sysenter_eip, 656);
689 CHECK_OFFSET(host_rsp, 664);
690 CHECK_OFFSET(host_rip, 672);
691 CHECK_OFFSET(pin_based_vm_exec_control, 744);
692 CHECK_OFFSET(cpu_based_vm_exec_control, 748);
693 CHECK_OFFSET(exception_bitmap, 752);
694 CHECK_OFFSET(page_fault_error_code_mask, 756);
695 CHECK_OFFSET(page_fault_error_code_match, 760);
696 CHECK_OFFSET(cr3_target_count, 764);
697 CHECK_OFFSET(vm_exit_controls, 768);
698 CHECK_OFFSET(vm_exit_msr_store_count, 772);
699 CHECK_OFFSET(vm_exit_msr_load_count, 776);
700 CHECK_OFFSET(vm_entry_controls, 780);
701 CHECK_OFFSET(vm_entry_msr_load_count, 784);
702 CHECK_OFFSET(vm_entry_intr_info_field, 788);
703 CHECK_OFFSET(vm_entry_exception_error_code, 792);
704 CHECK_OFFSET(vm_entry_instruction_len, 796);
705 CHECK_OFFSET(tpr_threshold, 800);
706 CHECK_OFFSET(secondary_vm_exec_control, 804);
707 CHECK_OFFSET(vm_instruction_error, 808);
708 CHECK_OFFSET(vm_exit_reason, 812);
709 CHECK_OFFSET(vm_exit_intr_info, 816);
710 CHECK_OFFSET(vm_exit_intr_error_code, 820);
711 CHECK_OFFSET(idt_vectoring_info_field, 824);
712 CHECK_OFFSET(idt_vectoring_error_code, 828);
713 CHECK_OFFSET(vm_exit_instruction_len, 832);
714 CHECK_OFFSET(vmx_instruction_info, 836);
715 CHECK_OFFSET(guest_es_limit, 840);
716 CHECK_OFFSET(guest_cs_limit, 844);
717 CHECK_OFFSET(guest_ss_limit, 848);
718 CHECK_OFFSET(guest_ds_limit, 852);
719 CHECK_OFFSET(guest_fs_limit, 856);
720 CHECK_OFFSET(guest_gs_limit, 860);
721 CHECK_OFFSET(guest_ldtr_limit, 864);
722 CHECK_OFFSET(guest_tr_limit, 868);
723 CHECK_OFFSET(guest_gdtr_limit, 872);
724 CHECK_OFFSET(guest_idtr_limit, 876);
725 CHECK_OFFSET(guest_es_ar_bytes, 880);
726 CHECK_OFFSET(guest_cs_ar_bytes, 884);
727 CHECK_OFFSET(guest_ss_ar_bytes, 888);
728 CHECK_OFFSET(guest_ds_ar_bytes, 892);
729 CHECK_OFFSET(guest_fs_ar_bytes, 896);
730 CHECK_OFFSET(guest_gs_ar_bytes, 900);
731 CHECK_OFFSET(guest_ldtr_ar_bytes, 904);
732 CHECK_OFFSET(guest_tr_ar_bytes, 908);
733 CHECK_OFFSET(guest_interruptibility_info, 912);
734 CHECK_OFFSET(guest_activity_state, 916);
735 CHECK_OFFSET(guest_sysenter_cs, 920);
736 CHECK_OFFSET(host_ia32_sysenter_cs, 924);
737 CHECK_OFFSET(vmx_preemption_timer_value, 928);
738 CHECK_OFFSET(virtual_processor_id, 960);
739 CHECK_OFFSET(posted_intr_nv, 962);
740 CHECK_OFFSET(guest_es_selector, 964);
741 CHECK_OFFSET(guest_cs_selector, 966);
742 CHECK_OFFSET(guest_ss_selector, 968);
743 CHECK_OFFSET(guest_ds_selector, 970);
744 CHECK_OFFSET(guest_fs_selector, 972);
745 CHECK_OFFSET(guest_gs_selector, 974);
746 CHECK_OFFSET(guest_ldtr_selector, 976);
747 CHECK_OFFSET(guest_tr_selector, 978);
748 CHECK_OFFSET(guest_intr_status, 980);
749 CHECK_OFFSET(host_es_selector, 982);
750 CHECK_OFFSET(host_cs_selector, 984);
751 CHECK_OFFSET(host_ss_selector, 986);
752 CHECK_OFFSET(host_ds_selector, 988);
753 CHECK_OFFSET(host_fs_selector, 990);
754 CHECK_OFFSET(host_gs_selector, 992);
755 CHECK_OFFSET(host_tr_selector, 994);
756 CHECK_OFFSET(guest_pml_index, 996);
757}
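/*
 * As a hypothetical example of extending vmcs12 without breaking the layout:
 * a new u64 field would take over one of the padding64[] slots (or be
 * appended at the end of the structure), and a matching CHECK_OFFSET()
 * entry would be added above so that any accidental layout change breaks
 * the build.
 */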
758
759/*
760 * VMCS12_REVISION is an arbitrary id that should be changed if the content or
761 * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
762 * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
763 *
764 * IMPORTANT: Changing this value will break save/restore compatibility with
765 * older kvm releases.
766 */
767#define VMCS12_REVISION 0x11e57ed0
768
769/*
770 * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
771 * and any VMCS region. Although only sizeof(struct vmcs12) bytes are used by
772 * the current implementation, a full 4K is reserved to avoid future complications.
773 */
774#define VMCS12_SIZE 0x1000
775
776/*
777 * VMCS12_MAX_FIELD_INDEX is the highest index value used in any
778 * supported VMCS12 field encoding.
779 */
780#define VMCS12_MAX_FIELD_INDEX 0x17
781
782struct nested_vmx_msrs {
783 /*
784 * We only store the "true" versions of the VMX capability MSRs. We
785 * generate the "non-true" versions by setting the must-be-1 bits
786 * according to the SDM.
787 */
788 u32 procbased_ctls_low;
789 u32 procbased_ctls_high;
790 u32 secondary_ctls_low;
791 u32 secondary_ctls_high;
792 u32 pinbased_ctls_low;
793 u32 pinbased_ctls_high;
794 u32 exit_ctls_low;
795 u32 exit_ctls_high;
796 u32 entry_ctls_low;
797 u32 entry_ctls_high;
798 u32 misc_low;
799 u32 misc_high;
800 u32 ept_caps;
801 u32 vpid_caps;
802 u64 basic;
803 u64 cr0_fixed0;
804 u64 cr0_fixed1;
805 u64 cr4_fixed0;
806 u64 cr4_fixed1;
807 u64 vmcs_enum;
808 u64 vmfunc_controls;
809};
810
811/*
812 * The nested_vmx structure is part of vcpu_vmx, and holds information we need
813 * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
814 */
815struct nested_vmx {
816 /* Has the level1 guest done vmxon? */
817 bool vmxon;
818 gpa_t vmxon_ptr;
819 bool pml_full;
820
821 /* The guest-physical address of the current VMCS L1 keeps for L2 */
822 gpa_t current_vmptr;
823 /*
824 * Cache of the guest's VMCS, existing outside of guest memory.
825 * Loaded from guest memory during VMPTRLD. Flushed to guest
826 * memory during VMCLEAR and VMPTRLD.
827 */
828 struct vmcs12 *cached_vmcs12;
829 /*
830 * Cache of the guest's shadow VMCS, existing outside of guest
831 * memory. Loaded from guest memory during VM entry. Flushed
832 * to guest memory during VM exit.
833 */
834 struct vmcs12 *cached_shadow_vmcs12;
835 /*
836 * Indicates if the shadow vmcs or enlightened vmcs must be updated
837 * with the data held by struct vmcs12.
838 */
839 bool need_vmcs12_sync;
840 bool dirty_vmcs12;
841
842 /*
843 * vmcs02 has been initialized, i.e. state that is constant for
844 * vmcs02 has been written to the backing VMCS. Initialization
845 * is delayed until L1 actually attempts to run a nested VM.
846 */
847 bool vmcs02_initialized;
848
849 bool change_vmcs01_virtual_apic_mode;
850
851 /*
852 * Enlightened VMCS has been enabled. It does not mean that L1 has to
853 * use it. However, VMX features available to L1 will be limited based
854 * on what the enlightened VMCS supports.
855 */
856 bool enlightened_vmcs_enabled;
857
858 /* L2 must run next, and mustn't decide to exit to L1. */
859 bool nested_run_pending;
860
861 struct loaded_vmcs vmcs02;
862
863 /*
864 * Guest pages referred to in the vmcs02 with host-physical
865 * pointers, so we must keep them pinned while L2 runs.
866 */
867 struct page *apic_access_page;
868 struct page *virtual_apic_page;
869 struct page *pi_desc_page;
870 struct pi_desc *pi_desc;
871 bool pi_pending;
872 u16 posted_intr_nv;
873
874 struct hrtimer preemption_timer;
875 bool preemption_timer_expired;
876
877 /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
878 u64 vmcs01_debugctl;
879 u64 vmcs01_guest_bndcfgs;
880
881 u16 vpid02;
882 u16 last_vpid;
883
884 struct nested_vmx_msrs msrs;
885
886 /* SMM related state */
887 struct {
888 /* in VMX operation on SMM entry? */
889 bool vmxon;
890 /* in guest mode on SMM entry? */
891 bool guest_mode;
892 } smm;
893
894 gpa_t hv_evmcs_vmptr;
895 struct page *hv_evmcs_page;
896 struct hv_enlightened_vmcs *hv_evmcs;
897};
898
899#define POSTED_INTR_ON 0
900#define POSTED_INTR_SN 1
901
902/* Posted-Interrupt Descriptor */
903struct pi_desc {
904 u32 pir[8]; /* Posted interrupt requested */
905 union {
906 struct {
907 /* bit 256 - Outstanding Notification */
908 u16 on : 1,
909 /* bit 257 - Suppress Notification */
910 sn : 1,
911 /* bit 271:258 - Reserved */
912 rsvd_1 : 14;
913 /* bit 279:272 - Notification Vector */
914 u8 nv;
915 /* bit 287:280 - Reserved */
916 u8 rsvd_2;
917 /* bit 319:288 - Notification Destination */
918 u32 ndst;
919 };
920 u64 control;
921 };
922 u32 rsvd[6];
923} __aligned(64);
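/*
 * The union above overlays the ON/SN/NV/NDST fields on the single 64-bit
 * 'control' word: POSTED_INTR_ON is bit 0 of 'control' (bit 256 of the
 * descriptor) and POSTED_INTR_SN is bit 1 (bit 257), which is what the
 * pi_*() helpers below manipulate with atomic bitops.
 */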
924
925static bool pi_test_and_set_on(struct pi_desc *pi_desc)
926{
927 return test_and_set_bit(POSTED_INTR_ON,
928 (unsigned long *)&pi_desc->control);
929}
930
931static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
932{
933 return test_and_clear_bit(POSTED_INTR_ON,
934 (unsigned long *)&pi_desc->control);
935}
936
937static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
938{
939 return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
940}
941
942static inline void pi_clear_sn(struct pi_desc *pi_desc)
943{
944 return clear_bit(POSTED_INTR_SN,
945 (unsigned long *)&pi_desc->control);
946}
947
948static inline void pi_set_sn(struct pi_desc *pi_desc)
949{
950 return set_bit(POSTED_INTR_SN,
951 (unsigned long *)&pi_desc->control);
952}
953
954static inline void pi_clear_on(struct pi_desc *pi_desc)
955{
956 clear_bit(POSTED_INTR_ON,
957 (unsigned long *)&pi_desc->control);
958}
959
960static inline int pi_test_on(struct pi_desc *pi_desc)
961{
962 return test_bit(POSTED_INTR_ON,
963 (unsigned long *)&pi_desc->control);
964}
965
966static inline int pi_test_sn(struct pi_desc *pi_desc)
967{
968 return test_bit(POSTED_INTR_SN,
969 (unsigned long *)&pi_desc->control);
970}
971
972struct vmx_msrs {
973 unsigned int nr;
974 struct vmx_msr_entry val[NR_AUTOLOAD_MSRS];
975};
976
977struct vcpu_vmx {
978 struct kvm_vcpu vcpu;
979 unsigned long host_rsp;
980 u8 fail;
981 u8 msr_bitmap_mode;
982 u32 exit_intr_info;
983 u32 idt_vectoring_info;
984 ulong rflags;
985 struct shared_msr_entry *guest_msrs;
986 int nmsrs;
987 int save_nmsrs;
988 bool guest_msrs_dirty;
989 unsigned long host_idt_base;
990#ifdef CONFIG_X86_64
991 u64 msr_host_kernel_gs_base;
992 u64 msr_guest_kernel_gs_base;
993#endif
994
995 u64 arch_capabilities;
996 u64 spec_ctrl;
997
998 u32 vm_entry_controls_shadow;
999 u32 vm_exit_controls_shadow;
1000 u32 secondary_exec_control;
1001
1002 /*
1003 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
1004 * non-nested (L1) guest, it always points to vmcs01. For a nested
1005 * guest (L2), it points to a different VMCS. loaded_cpu_state points
1006 * to the VMCS whose state is loaded into the CPU registers that only
1007 * need to be switched when transitioning to/from the kernel; a NULL
1008 * value indicates that host state is loaded.
1009 */
1010 struct loaded_vmcs vmcs01;
1011 struct loaded_vmcs *loaded_vmcs;
1012 struct loaded_vmcs *loaded_cpu_state;
1013 bool __launched; /* temporary, used in vmx_vcpu_run */
1014 struct msr_autoload {
1015 struct vmx_msrs guest;
1016 struct vmx_msrs host;
1017 } msr_autoload;
1018
1019 struct {
1020 int vm86_active;
1021 ulong save_rflags;
1022 struct kvm_segment segs[8];
1023 } rmode;
1024 struct {
1025 u32 bitmask; /* 4 bits per segment (1 bit per field) */
1026 struct kvm_save_segment {
1027 u16 selector;
1028 unsigned long base;
1029 u32 limit;
1030 u32 ar;
1031 } seg[8];
1032 } segment_cache;
1033 int vpid;
1034 bool emulation_required;
1035
1036 u32 exit_reason;
1037
1038 /* Posted interrupt descriptor */
1039 struct pi_desc pi_desc;
1040
1041 /* Support for a guest hypervisor (nested VMX) */
1042 struct nested_vmx nested;
1043
1044 /* Dynamic PLE window. */
1045 int ple_window;
1046 bool ple_window_dirty;
1047
1048 bool req_immediate_exit;
1049
1050 /* Support for PML */
1051#define PML_ENTITY_NUM 512
1052 struct page *pml_pg;
1053
1054 /* apic deadline value in host tsc */
1055 u64 hv_deadline_tsc;
1056
1057 u64 current_tsc_ratio;
1058
1059 u32 host_pkru;
1060
1061 unsigned long host_debugctlmsr;
1062
1063 /*
1064 * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
1065 * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
1066 * in msr_ia32_feature_control_valid_bits.
1067 */
1068 u64 msr_ia32_feature_control;
1069 u64 msr_ia32_feature_control_valid_bits;
1070 u64 ept_pointer;
1071};
1072
1073enum segment_cache_field {
1074 SEG_FIELD_SEL = 0,
1075 SEG_FIELD_BASE = 1,
1076 SEG_FIELD_LIMIT = 2,
1077 SEG_FIELD_AR = 3,
1078
1079 SEG_FIELD_NR = 4
1080};
1081
1082static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
1083{
1084 return container_of(kvm, struct kvm_vmx, kvm);
1085}
1086
1087static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
1088{
1089 return container_of(vcpu, struct vcpu_vmx, vcpu);
1090}
1091
1092static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
1093{
1094 return &(to_vmx(vcpu)->pi_desc);
1095}
1096
1097#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
1098#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
1099#define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name)
1100#define FIELD64(number, name) \
1101 FIELD(number, name), \
1102 [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32)
1103
1104
1105static u16 shadow_read_only_fields[] = {
1106#define SHADOW_FIELD_RO(x) x,
1107#include "vmx_shadow_fields.h"
1108};
1109static int max_shadow_read_only_fields =
1110 ARRAY_SIZE(shadow_read_only_fields);
1111
1112static u16 shadow_read_write_fields[] = {
1113#define SHADOW_FIELD_RW(x) x,
1114#include "vmx_shadow_fields.h"
1115};
1116static int max_shadow_read_write_fields =
1117 ARRAY_SIZE(shadow_read_write_fields);
1118
1119static const unsigned short vmcs_field_to_offset_table[] = {
1120 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
1121 FIELD(POSTED_INTR_NV, posted_intr_nv),
1122 FIELD(GUEST_ES_SELECTOR, guest_es_selector),
1123 FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
1124 FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
1125 FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
1126 FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
1127 FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
1128 FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
1129 FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
1130 FIELD(GUEST_INTR_STATUS, guest_intr_status),
1131 FIELD(GUEST_PML_INDEX, guest_pml_index),
1132 FIELD(HOST_ES_SELECTOR, host_es_selector),
1133 FIELD(HOST_CS_SELECTOR, host_cs_selector),
1134 FIELD(HOST_SS_SELECTOR, host_ss_selector),
1135 FIELD(HOST_DS_SELECTOR, host_ds_selector),
1136 FIELD(HOST_FS_SELECTOR, host_fs_selector),
1137 FIELD(HOST_GS_SELECTOR, host_gs_selector),
1138 FIELD(HOST_TR_SELECTOR, host_tr_selector),
1139 FIELD64(IO_BITMAP_A, io_bitmap_a),
1140 FIELD64(IO_BITMAP_B, io_bitmap_b),
1141 FIELD64(MSR_BITMAP, msr_bitmap),
1142 FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
1143 FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
1144 FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
1145 FIELD64(PML_ADDRESS, pml_address),
1146 FIELD64(TSC_OFFSET, tsc_offset),
1147 FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
1148 FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
1149 FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
1150 FIELD64(VM_FUNCTION_CONTROL, vm_function_control),
1151 FIELD64(EPT_POINTER, ept_pointer),
1152 FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
1153 FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
1154 FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
1155 FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
1156 FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
1157 FIELD64(VMREAD_BITMAP, vmread_bitmap),
1158 FIELD64(VMWRITE_BITMAP, vmwrite_bitmap),
1159 FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
1160 FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
1161 FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
1162 FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
1163 FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
1164 FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
1165 FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
1166 FIELD64(GUEST_PDPTR0, guest_pdptr0),
1167 FIELD64(GUEST_PDPTR1, guest_pdptr1),
1168 FIELD64(GUEST_PDPTR2, guest_pdptr2),
1169 FIELD64(GUEST_PDPTR3, guest_pdptr3),
1170 FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
1171 FIELD64(HOST_IA32_PAT, host_ia32_pat),
1172 FIELD64(HOST_IA32_EFER, host_ia32_efer),
1173 FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
1174 FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
1175 FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
1176 FIELD(EXCEPTION_BITMAP, exception_bitmap),
1177 FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
1178 FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
1179 FIELD(CR3_TARGET_COUNT, cr3_target_count),
1180 FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
1181 FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
1182 FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
1183 FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
1184 FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
1185 FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
1186 FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
1187 FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
1188 FIELD(TPR_THRESHOLD, tpr_threshold),
1189 FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
1190 FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
1191 FIELD(VM_EXIT_REASON, vm_exit_reason),
1192 FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
1193 FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
1194 FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
1195 FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
1196 FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
1197 FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
1198 FIELD(GUEST_ES_LIMIT, guest_es_limit),
1199 FIELD(GUEST_CS_LIMIT, guest_cs_limit),
1200 FIELD(GUEST_SS_LIMIT, guest_ss_limit),
1201 FIELD(GUEST_DS_LIMIT, guest_ds_limit),
1202 FIELD(GUEST_FS_LIMIT, guest_fs_limit),
1203 FIELD(GUEST_GS_LIMIT, guest_gs_limit),
1204 FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
1205 FIELD(GUEST_TR_LIMIT, guest_tr_limit),
1206 FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
1207 FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
1208 FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
1209 FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
1210 FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
1211 FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
1212 FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
1213 FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
1214 FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
1215 FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
1216 FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
1217 FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
1218 FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
1219 FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
1220 FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
1221 FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
1222 FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
1223 FIELD(CR0_READ_SHADOW, cr0_read_shadow),
1224 FIELD(CR4_READ_SHADOW, cr4_read_shadow),
1225 FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
1226 FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
1227 FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
1228 FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
1229 FIELD(EXIT_QUALIFICATION, exit_qualification),
1230 FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
1231 FIELD(GUEST_CR0, guest_cr0),
1232 FIELD(GUEST_CR3, guest_cr3),
1233 FIELD(GUEST_CR4, guest_cr4),
1234 FIELD(GUEST_ES_BASE, guest_es_base),
1235 FIELD(GUEST_CS_BASE, guest_cs_base),
1236 FIELD(GUEST_SS_BASE, guest_ss_base),
1237 FIELD(GUEST_DS_BASE, guest_ds_base),
1238 FIELD(GUEST_FS_BASE, guest_fs_base),
1239 FIELD(GUEST_GS_BASE, guest_gs_base),
1240 FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
1241 FIELD(GUEST_TR_BASE, guest_tr_base),
1242 FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
1243 FIELD(GUEST_IDTR_BASE, guest_idtr_base),
1244 FIELD(GUEST_DR7, guest_dr7),
1245 FIELD(GUEST_RSP, guest_rsp),
1246 FIELD(GUEST_RIP, guest_rip),
1247 FIELD(GUEST_RFLAGS, guest_rflags),
1248 FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
1249 FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
1250 FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
1251 FIELD(HOST_CR0, host_cr0),
1252 FIELD(HOST_CR3, host_cr3),
1253 FIELD(HOST_CR4, host_cr4),
1254 FIELD(HOST_FS_BASE, host_fs_base),
1255 FIELD(HOST_GS_BASE, host_gs_base),
1256 FIELD(HOST_TR_BASE, host_tr_base),
1257 FIELD(HOST_GDTR_BASE, host_gdtr_base),
1258 FIELD(HOST_IDTR_BASE, host_idtr_base),
1259 FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
1260 FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
1261 FIELD(HOST_RSP, host_rsp),
1262 FIELD(HOST_RIP, host_rip),
1263};
1264
1265static inline short vmcs_field_to_offset(unsigned long field)
1266{
1267 const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table);
1268 unsigned short offset;
1269 unsigned index;
1270
1271 if (field >> 15)
1272 return -ENOENT;
1273
1274 index = ROL16(field, 6);
1275 if (index >= size)
1276 return -ENOENT;
1277
1278 index = array_index_nospec(index, size);
1279 offset = vmcs_field_to_offset_table[index];
1280 if (offset == 0)
1281 return -ENOENT;
1282 return offset;
1283}
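/*
 * Worked example, using values visible in the table and offset checks above:
 * GUEST_ES_SELECTOR is encoded as 0x0800, and ROL16(0x0800, 6) == 0x0002, so
 * vmcs_field_to_offset() returns vmcs_field_to_offset_table[2], i.e.
 * offsetof(struct vmcs12, guest_es_selector) == 964. Rotating by 6 moves the
 * encoding's width/type bits into the low bits of the index, which keeps the
 * lookup table reasonably dense.
 */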
1284
1285static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
1286{
1287 return to_vmx(vcpu)->nested.cached_vmcs12;
1288}
1289
1290static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu)
1291{
1292 return to_vmx(vcpu)->nested.cached_shadow_vmcs12;
1293}
1294
1295static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
1296static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
1297static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
1298static bool vmx_xsaves_supported(void);
1299static void vmx_set_segment(struct kvm_vcpu *vcpu,
1300 struct kvm_segment *var, int seg);
1301static void vmx_get_segment(struct kvm_vcpu *vcpu,
1302 struct kvm_segment *var, int seg);
1303static bool guest_state_valid(struct kvm_vcpu *vcpu);
1304static u32 vmx_segment_access_rights(struct kvm_segment *var);
1305static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
1306static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
1307static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
1308static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
1309 u16 error_code);
1310static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
1311static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
1312 u32 msr, int type);
1313
1314static DEFINE_PER_CPU(struct vmcs *, vmxarea);
1315static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
1316/*
1317 * We maintain a per-CPU linked list of the VMCSs loaded on that CPU. This is needed
1318 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
1319 */
1320static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
1321
1322/*
1323 * We maintain a per-CPU linked list of vCPUs, so in wakeup_handler() we
1324 * can find which vCPU should be woken up.
1325 */
1326static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
1327static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
1328
1329enum {
1330 VMX_VMREAD_BITMAP,
1331 VMX_VMWRITE_BITMAP,
1332 VMX_BITMAP_NR
1333};
1334
1335static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
1336
1337#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
1338#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])
1339
1340static bool cpu_has_load_ia32_efer;
1341static bool cpu_has_load_perf_global_ctrl;
1342
1343static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
1344static DEFINE_SPINLOCK(vmx_vpid_lock);
1345
1346static struct vmcs_config {
1347 int size;
1348 int order;
1349 u32 basic_cap;
1350 u32 revision_id;
1351 u32 pin_based_exec_ctrl;
1352 u32 cpu_based_exec_ctrl;
1353 u32 cpu_based_2nd_exec_ctrl;
1354 u32 vmexit_ctrl;
1355 u32 vmentry_ctrl;
1356 struct nested_vmx_msrs nested;
1357} vmcs_config;
1358
1359static struct vmx_capability {
1360 u32 ept;
1361 u32 vpid;
1362} vmx_capability;
1363
1364#define VMX_SEGMENT_FIELD(seg) \
1365 [VCPU_SREG_##seg] = { \
1366 .selector = GUEST_##seg##_SELECTOR, \
1367 .base = GUEST_##seg##_BASE, \
1368 .limit = GUEST_##seg##_LIMIT, \
1369 .ar_bytes = GUEST_##seg##_AR_BYTES, \
1370 }
1371
1372static const struct kvm_vmx_segment_field {
1373 unsigned selector;
1374 unsigned base;
1375 unsigned limit;
1376 unsigned ar_bytes;
1377} kvm_vmx_segment_fields[] = {
1378 VMX_SEGMENT_FIELD(CS),
1379 VMX_SEGMENT_FIELD(DS),
1380 VMX_SEGMENT_FIELD(ES),
1381 VMX_SEGMENT_FIELD(FS),
1382 VMX_SEGMENT_FIELD(GS),
1383 VMX_SEGMENT_FIELD(SS),
1384 VMX_SEGMENT_FIELD(TR),
1385 VMX_SEGMENT_FIELD(LDTR),
1386};
1387
1388static u64 host_efer;
1389
1390static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
1391
1392/*
1393 * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
1394 * away by decrementing the array size.
1395 */
1396static const u32 vmx_msr_index[] = {
1397#ifdef CONFIG_X86_64
1398 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
1399#endif
1400 MSR_EFER, MSR_TSC_AUX, MSR_STAR,
1401};
1402
1403DEFINE_STATIC_KEY_FALSE(enable_evmcs);
1404
1405#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs))
1406
1407#define KVM_EVMCS_VERSION 1
1408
1409/*
1410 * Enlightened VMCSv1 doesn't support these:
1411 *
1412 * POSTED_INTR_NV = 0x00000002,
1413 * GUEST_INTR_STATUS = 0x00000810,
1414 * APIC_ACCESS_ADDR = 0x00002014,
1415 * POSTED_INTR_DESC_ADDR = 0x00002016,
1416 * EOI_EXIT_BITMAP0 = 0x0000201c,
1417 * EOI_EXIT_BITMAP1 = 0x0000201e,
1418 * EOI_EXIT_BITMAP2 = 0x00002020,
1419 * EOI_EXIT_BITMAP3 = 0x00002022,
1420 * GUEST_PML_INDEX = 0x00000812,
1421 * PML_ADDRESS = 0x0000200e,
1422 * VM_FUNCTION_CONTROL = 0x00002018,
1423 * EPTP_LIST_ADDRESS = 0x00002024,
1424 * VMREAD_BITMAP = 0x00002026,
1425 * VMWRITE_BITMAP = 0x00002028,
1426 *
1427 * TSC_MULTIPLIER = 0x00002032,
1428 * PLE_GAP = 0x00004020,
1429 * PLE_WINDOW = 0x00004022,
1430 * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E,
1431 * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808,
1432 * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04,
1433 *
1434 * Currently unsupported in KVM:
1435 * GUEST_IA32_RTIT_CTL = 0x00002814,
1436 */
1437#define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \
1438 PIN_BASED_VMX_PREEMPTION_TIMER)
1439#define EVMCS1_UNSUPPORTED_2NDEXEC \
1440 (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \
1441 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \
1442 SECONDARY_EXEC_APIC_REGISTER_VIRT | \
1443 SECONDARY_EXEC_ENABLE_PML | \
1444 SECONDARY_EXEC_ENABLE_VMFUNC | \
1445 SECONDARY_EXEC_SHADOW_VMCS | \
1446 SECONDARY_EXEC_TSC_SCALING | \
1447 SECONDARY_EXEC_PAUSE_LOOP_EXITING)
1448#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
1449#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
1450#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING)
1451
1452#if IS_ENABLED(CONFIG_HYPERV)
1453static bool __read_mostly enlightened_vmcs = true;
1454module_param(enlightened_vmcs, bool, 0444);
1455
1456static inline void evmcs_write64(unsigned long field, u64 value)
1457{
1458 u16 clean_field;
1459 int offset = get_evmcs_offset(field, &clean_field);
1460
1461 if (offset < 0)
1462 return;
1463
1464 *(u64 *)((char *)current_evmcs + offset) = value;
1465
1466 current_evmcs->hv_clean_fields &= ~clean_field;
1467}
1468
1469static inline void evmcs_write32(unsigned long field, u32 value)
1470{
1471 u16 clean_field;
1472 int offset = get_evmcs_offset(field, &clean_field);
1473
1474 if (offset < 0)
1475 return;
1476
1477 *(u32 *)((char *)current_evmcs + offset) = value;
1478 current_evmcs->hv_clean_fields &= ~clean_field;
1479}
1480
1481static inline void evmcs_write16(unsigned long field, u16 value)
1482{
1483 u16 clean_field;
1484 int offset = get_evmcs_offset(field, &clean_field);
1485
1486 if (offset < 0)
1487 return;
1488
1489 *(u16 *)((char *)current_evmcs + offset) = value;
1490 current_evmcs->hv_clean_fields &= ~clean_field;
1491}
1492
1493static inline u64 evmcs_read64(unsigned long field)
1494{
1495 int offset = get_evmcs_offset(field, NULL);
1496
1497 if (offset < 0)
1498 return 0;
1499
1500 return *(u64 *)((char *)current_evmcs + offset);
1501}
1502
1503static inline u32 evmcs_read32(unsigned long field)
1504{
1505 int offset = get_evmcs_offset(field, NULL);
1506
1507 if (offset < 0)
1508 return 0;
1509
1510 return *(u32 *)((char *)current_evmcs + offset);
1511}
1512
1513static inline u16 evmcs_read16(unsigned long field)
1514{
1515 int offset = get_evmcs_offset(field, NULL);
1516
1517 if (offset < 0)
1518 return 0;
1519
1520 return *(u16 *)((char *)current_evmcs + offset);
1521}
1522
1523static inline void evmcs_touch_msr_bitmap(void)
1524{
1525 if (unlikely(!current_evmcs))
1526 return;
1527
1528 if (current_evmcs->hv_enlightenments_control.msr_bitmap)
1529 current_evmcs->hv_clean_fields &=
1530 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
1531}
1532
1533static void evmcs_load(u64 phys_addr)
1534{
1535 struct hv_vp_assist_page *vp_ap =
1536 hv_get_vp_assist_page(smp_processor_id());
1537
1538 vp_ap->current_nested_vmcs = phys_addr;
1539 vp_ap->enlighten_vmentry = 1;
1540}
1541
1542static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
1543{
1544 vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL;
1545 vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
1546
1547 vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
1548 vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
1549
1550}
1551
1552/* check_ept_pointer_match() must be called with ept_pointer_lock held. */
1553static void check_ept_pointer_match(struct kvm *kvm)
1554{
1555 struct kvm_vcpu *vcpu;
1556 u64 tmp_eptp = INVALID_PAGE;
1557 int i;
1558
1559 kvm_for_each_vcpu(i, vcpu, kvm) {
1560 if (!VALID_PAGE(tmp_eptp)) {
1561 tmp_eptp = to_vmx(vcpu)->ept_pointer;
1562 } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
1563 to_kvm_vmx(kvm)->ept_pointers_match
1564 = EPT_POINTERS_MISMATCH;
1565 return;
1566 }
1567 }
1568
1569 to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
1570}
1571
1572static int vmx_hv_remote_flush_tlb(struct kvm *kvm)
1573{
1574 struct kvm_vcpu *vcpu;
1575 int ret = -ENOTSUPP, i;
1576
1577 spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
1578
1579 if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
1580 check_ept_pointer_match(kvm);
1581
1582 /*
1583 * The FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the address of
1584 * the base of the EPT PML4 table, so strip off the EPT configuration bits.
1585 */
1586 if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
1587 kvm_for_each_vcpu(i, vcpu, kvm)
1588 ret |= hyperv_flush_guest_mapping(
1589 to_vmx(kvm_get_vcpu(kvm, i))->ept_pointer & PAGE_MASK);
1590 } else {
1591 ret = hyperv_flush_guest_mapping(
1592 to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer & PAGE_MASK);
1593 }
1594
1595 spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
1596 return ret;
1597}
1598#else /* !IS_ENABLED(CONFIG_HYPERV) */
1599static inline void evmcs_write64(unsigned long field, u64 value) {}
1600static inline void evmcs_write32(unsigned long field, u32 value) {}
1601static inline void evmcs_write16(unsigned long field, u16 value) {}
1602static inline u64 evmcs_read64(unsigned long field) { return 0; }
1603static inline u32 evmcs_read32(unsigned long field) { return 0; }
1604static inline u16 evmcs_read16(unsigned long field) { return 0; }
1605static inline void evmcs_load(u64 phys_addr) {}
1606static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
1607static inline void evmcs_touch_msr_bitmap(void) {}
1608#endif /* IS_ENABLED(CONFIG_HYPERV) */
1609
1610static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
1611 uint16_t *vmcs_version)
1612{
1613 struct vcpu_vmx *vmx = to_vmx(vcpu);
1614
1615 /*
1616 * vmcs_version represents the range of supported Enlightened VMCS
1617 * versions: the low 8 bits hold the minimal version, the high 8 bits hold
1618 * the maximum supported version. KVM supports versions from 1 to
1619 * KVM_EVMCS_VERSION.
1620 */
1621 if (vmcs_version)
1622 *vmcs_version = (KVM_EVMCS_VERSION << 8) | 1;
1623
1624 /* We don't support disabling the feature for simplicity. */
1625 if (vmx->nested.enlightened_vmcs_enabled)
1626 return 0;
1627
1628 vmx->nested.enlightened_vmcs_enabled = true;
1629
1630 vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL;
1631 vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
1632 vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
1633 vmx->nested.msrs.secondary_ctls_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
1634 vmx->nested.msrs.vmfunc_controls &= ~EVMCS1_UNSUPPORTED_VMFUNC;
1635
1636 return 0;
1637}
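/*
 * With KVM_EVMCS_VERSION == 1 the value written to *vmcs_version above is
 * 0x0101: minimum supported Enlightened VMCS version 1 in the low byte and
 * maximum supported version 1 in the high byte.
 */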
1638
1639static inline bool is_exception_n(u32 intr_info, u8 vector)
1640{
1641 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
1642 INTR_INFO_VALID_MASK)) ==
1643 (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
1644}
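/*
 * intr_info uses the VM-exit interruption-information layout from the SDM:
 * bits 7:0 hold the vector, bits 10:8 the event type and bit 31 the valid
 * bit, hence the comparison against
 * INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK above.
 */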
1645
1646static inline bool is_debug(u32 intr_info)
1647{
1648 return is_exception_n(intr_info, DB_VECTOR);
1649}
1650
1651static inline bool is_breakpoint(u32 intr_info)
1652{
1653 return is_exception_n(intr_info, BP_VECTOR);
1654}
1655
1656static inline bool is_page_fault(u32 intr_info)
1657{
1658 return is_exception_n(intr_info, PF_VECTOR);
1659}
1660
1661static inline bool is_invalid_opcode(u32 intr_info)
1662{
1663 return is_exception_n(intr_info, UD_VECTOR);
1664}
1665
1666static inline bool is_gp_fault(u32 intr_info)
1667{
1668 return is_exception_n(intr_info, GP_VECTOR);
1669}
1670
1671static inline bool is_machine_check(u32 intr_info)
1672{
1673 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
1674 INTR_INFO_VALID_MASK)) ==
1675 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
1676}
1677
1678/* Undocumented: icebp/int1 */
1679static inline bool is_icebp(u32 intr_info)
1680{
1681 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
1682 == (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK);
1683}
1684
1685static inline bool cpu_has_vmx_msr_bitmap(void)
1686{
1687 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
1688}
1689
1690static inline bool cpu_has_vmx_tpr_shadow(void)
1691{
1692 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
1693}
1694
1695static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu)
1696{
1697 return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu);
1698}
1699
1700static inline bool cpu_has_secondary_exec_ctrls(void)
1701{
1702 return vmcs_config.cpu_based_exec_ctrl &
1703 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
1704}
1705
1706static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
1707{
1708 return vmcs_config.cpu_based_2nd_exec_ctrl &
1709 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1710}
1711
1712static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
1713{
1714 return vmcs_config.cpu_based_2nd_exec_ctrl &
1715 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
1716}
1717
1718static inline bool cpu_has_vmx_apic_register_virt(void)
1719{
1720 return vmcs_config.cpu_based_2nd_exec_ctrl &
1721 SECONDARY_EXEC_APIC_REGISTER_VIRT;
1722}
1723
1724static inline bool cpu_has_vmx_virtual_intr_delivery(void)
1725{
1726 return vmcs_config.cpu_based_2nd_exec_ctrl &
1727 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
1728}
1729
1730static inline bool cpu_has_vmx_encls_vmexit(void)
1731{
1732 return vmcs_config.cpu_based_2nd_exec_ctrl &
1733 SECONDARY_EXEC_ENCLS_EXITING;
1734}
1735
1736/*
1737 * Comment format: document - errata name - stepping - processor name.
1738 * Taken from
1739 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
1740 */
1741static u32 vmx_preemption_cpu_tfms[] = {
1742/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */
17430x000206E6,
1744/* 323056.pdf - AAX65 - C2 - Xeon L3406 */
1745/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
1746/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
17470x00020652,
1748/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
17490x00020655,
1750/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */
1751/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */
1752/*
1753 * 320767.pdf - AAP86 - B1 -
1754 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
1755 */
17560x000106E5,
1757/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
17580x000106A0,
1759/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
17600x000106A1,
1761/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
17620x000106A4,
1763 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
1764 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
1765 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
17660x000106A5,
1767};
1768
1769static inline bool cpu_has_broken_vmx_preemption_timer(void)
1770{
1771 u32 eax = cpuid_eax(0x00000001), i;
1772
1773 /* Clear the reserved bits */
1774 eax &= ~(0x3U << 14 | 0xfU << 28);
1775 for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
1776 if (eax == vmx_preemption_cpu_tfms[i])
1777 return true;
1778
1779 return false;
1780}
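/*
 * The mask above clears CPUID.01H:EAX bits 15:14 and 31:28, which are
 * reserved, leaving the family/model/stepping signature that is compared
 * against the erratum list, e.g. 0x000106A5 for the affected Xeon 3500/5500
 * steppings.
 */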
1781
1782static inline bool cpu_has_vmx_preemption_timer(void)
1783{
1784 return vmcs_config.pin_based_exec_ctrl &
1785 PIN_BASED_VMX_PREEMPTION_TIMER;
1786}
1787
1788static inline bool cpu_has_vmx_posted_intr(void)
1789{
1790 return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
1791 vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
1792}
1793
1794static inline bool cpu_has_vmx_apicv(void)
1795{
1796 return cpu_has_vmx_apic_register_virt() &&
1797 cpu_has_vmx_virtual_intr_delivery() &&
1798 cpu_has_vmx_posted_intr();
1799}
1800
1801static inline bool cpu_has_vmx_flexpriority(void)
1802{
1803 return cpu_has_vmx_tpr_shadow() &&
1804 cpu_has_vmx_virtualize_apic_accesses();
1805}
1806
1807static inline bool cpu_has_vmx_ept_execute_only(void)
1808{
1809 return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
1810}
1811
1812static inline bool cpu_has_vmx_ept_2m_page(void)
1813{
1814 return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
1815}
1816
1817static inline bool cpu_has_vmx_ept_1g_page(void)
1818{
1819 return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
1820}
1821
1822static inline bool cpu_has_vmx_ept_4levels(void)
1823{
1824 return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
1825}
1826
1827static inline bool cpu_has_vmx_ept_mt_wb(void)
1828{
1829 return vmx_capability.ept & VMX_EPTP_WB_BIT;
1830}
1831
1832static inline bool cpu_has_vmx_ept_5levels(void)
1833{
1834 return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT;
1835}
1836
1837static inline bool cpu_has_vmx_ept_ad_bits(void)
1838{
1839 return vmx_capability.ept & VMX_EPT_AD_BIT;
1840}
1841
1842static inline bool cpu_has_vmx_invept_context(void)
1843{
1844 return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
1845}
1846
1847static inline bool cpu_has_vmx_invept_global(void)
1848{
1849 return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
1850}
1851
1852static inline bool cpu_has_vmx_invvpid_individual_addr(void)
1853{
1854 return vmx_capability.vpid & VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT;
1855}
1856
1857static inline bool cpu_has_vmx_invvpid_single(void)
1858{
1859 return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
1860}
1861
1862static inline bool cpu_has_vmx_invvpid_global(void)
1863{
1864 return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
1865}
1866
1867static inline bool cpu_has_vmx_invvpid(void)
1868{
1869 return vmx_capability.vpid & VMX_VPID_INVVPID_BIT;
1870}
1871
1872static inline bool cpu_has_vmx_ept(void)
1873{
1874 return vmcs_config.cpu_based_2nd_exec_ctrl &
1875 SECONDARY_EXEC_ENABLE_EPT;
1876}
1877
1878static inline bool cpu_has_vmx_unrestricted_guest(void)
1879{
1880 return vmcs_config.cpu_based_2nd_exec_ctrl &
1881 SECONDARY_EXEC_UNRESTRICTED_GUEST;
1882}
1883
1884static inline bool cpu_has_vmx_ple(void)
1885{
1886 return vmcs_config.cpu_based_2nd_exec_ctrl &
1887 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
1888}
1889
1890static inline bool cpu_has_vmx_basic_inout(void)
1891{
1892 return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
1893}
1894
1895static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
1896{
1897 return flexpriority_enabled && lapic_in_kernel(vcpu);
1898}
1899
1900static inline bool cpu_has_vmx_vpid(void)
1901{
1902 return vmcs_config.cpu_based_2nd_exec_ctrl &
1903 SECONDARY_EXEC_ENABLE_VPID;
1904}
1905
1906static inline bool cpu_has_vmx_rdtscp(void)
1907{
1908 return vmcs_config.cpu_based_2nd_exec_ctrl &
1909 SECONDARY_EXEC_RDTSCP;
1910}
1911
1912static inline bool cpu_has_vmx_invpcid(void)
1913{
1914 return vmcs_config.cpu_based_2nd_exec_ctrl &
1915 SECONDARY_EXEC_ENABLE_INVPCID;
1916}
1917
1918static inline bool cpu_has_virtual_nmis(void)
1919{
1920 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
1921}
1922
1923static inline bool cpu_has_vmx_wbinvd_exit(void)
1924{
1925 return vmcs_config.cpu_based_2nd_exec_ctrl &
1926 SECONDARY_EXEC_WBINVD_EXITING;
1927}
1928
1929static inline bool cpu_has_vmx_shadow_vmcs(void)
1930{
1931 u64 vmx_msr;
1932 rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
1933 /* check if the cpu supports writing r/o exit information fields */
1934 if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
1935 return false;
1936
1937 return vmcs_config.cpu_based_2nd_exec_ctrl &
1938 SECONDARY_EXEC_SHADOW_VMCS;
1939}
1940
1941static inline bool cpu_has_vmx_pml(void)
1942{
1943 return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
1944}
1945
1946static inline bool cpu_has_vmx_tsc_scaling(void)
1947{
1948 return vmcs_config.cpu_based_2nd_exec_ctrl &
1949 SECONDARY_EXEC_TSC_SCALING;
1950}
1951
1952static inline bool cpu_has_vmx_vmfunc(void)
1953{
1954 return vmcs_config.cpu_based_2nd_exec_ctrl &
1955 SECONDARY_EXEC_ENABLE_VMFUNC;
1956}
1957
1958static bool vmx_umip_emulated(void)
1959{
1960 return vmcs_config.cpu_based_2nd_exec_ctrl &
1961 SECONDARY_EXEC_DESC;
1962}
1963
1964static inline bool report_flexpriority(void)
1965{
1966 return flexpriority_enabled;
1967}
1968
1969static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
1970{
1971 return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low);
1972}
1973
1974/*
1975 * Do the virtual VMX capability MSRs specify that L1 can use VMWRITE
1976 * to modify any valid field of the VMCS, or are the VM-exit
1977 * information fields read-only?
1978 */
1979static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu)
1980{
1981 return to_vmx(vcpu)->nested.msrs.misc_low &
1982 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
1983}
1984
1985static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu)
1986{
1987 return to_vmx(vcpu)->nested.msrs.misc_low & VMX_MISC_ZERO_LEN_INS;
1988}
1989
1990static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu)
1991{
1992 return to_vmx(vcpu)->nested.msrs.procbased_ctls_high &
1993 CPU_BASED_MONITOR_TRAP_FLAG;
1994}
1995
1996static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu)
1997{
1998 return to_vmx(vcpu)->nested.msrs.secondary_ctls_high &
1999 SECONDARY_EXEC_SHADOW_VMCS;
2000}
2001
2002static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
2003{
2004 return vmcs12->cpu_based_vm_exec_control & bit;
2005}
2006
2007static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
2008{
2009 return (vmcs12->cpu_based_vm_exec_control &
2010 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
2011 (vmcs12->secondary_vm_exec_control & bit);
2012}
2013
2014static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
2015{
2016 return vmcs12->pin_based_vm_exec_control &
2017 PIN_BASED_VMX_PREEMPTION_TIMER;
2018}
2019
2020static inline bool nested_cpu_has_nmi_exiting(struct vmcs12 *vmcs12)
2021{
2022 return vmcs12->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING;
2023}
2024
2025static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
2026{
2027 return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
2028}
2029
2030static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
2031{
2032 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
2033}
2034
2035static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
2036{
2037 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
2038}
2039
2040static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12)
2041{
2042 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML);
2043}
2044
2045static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
2046{
2047 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
2048}
2049
2050static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12)
2051{
2052 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID);
2053}
2054
2055static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
2056{
2057 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
2058}
2059
2060static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
2061{
2062 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
2063}
2064
2065static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
2066{
2067 return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
2068}
2069
2070static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
2071{
2072 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
2073}
2074
2075static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
2076{
2077 return nested_cpu_has_vmfunc(vmcs12) &&
2078 (vmcs12->vm_function_control &
2079 VMX_VMFUNC_EPTP_SWITCHING);
2080}
2081
2082static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12)
2083{
2084 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS);
2085}
2086
2087static inline bool is_nmi(u32 intr_info)
2088{
2089 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
2090 == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
2091}
2092
2093static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
2094 u32 exit_intr_info,
2095 unsigned long exit_qualification);
2096
2097static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
2098{
2099 int i;
2100
2101 for (i = 0; i < vmx->nmsrs; ++i)
2102 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
2103 return i;
2104 return -1;
2105}
2106
2107static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
2108{
2109 struct {
2110 u64 vpid : 16;
2111 u64 rsvd : 48;
2112 u64 gva;
2113 } operand = { vpid, 0, gva };
2114 bool error;
2115
2116 asm volatile (__ex("invvpid %2, %1") CC_SET(na)
2117 : CC_OUT(na) (error) : "r"(ext), "m"(operand));
2118 BUG_ON(error);
2119}
2120
2121static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
2122{
2123 struct {
2124 u64 eptp, gpa;
2125 } operand = {eptp, gpa};
2126 bool error;
2127
2128 asm volatile (__ex("invept %2, %1") CC_SET(na)
2129 : CC_OUT(na) (error) : "r"(ext), "m"(operand));
2130 BUG_ON(error);
2131}
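/*
 * Both INVVPID and INVEPT take the extent type in a register and a 128-bit
 * in-memory descriptor: {vpid, reserved, linear address} for INVVPID and
 * {EPTP, guest-physical address} for INVEPT, as laid out by the structs
 * above. A failure here would indicate a KVM or hardware bug, hence the
 * BUG_ON().
 */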
2132
2133static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
2134{
2135 int i;
2136
2137 i = __find_msr_index(vmx, msr);
2138 if (i >= 0)
2139 return &vmx->guest_msrs[i];
2140 return NULL;
2141}
2142
2143static void vmcs_clear(struct vmcs *vmcs)
2144{
2145 u64 phys_addr = __pa(vmcs);
2146 bool error;
2147
2148 asm volatile (__ex("vmclear %1") CC_SET(na)
2149 : CC_OUT(na) (error) : "m"(phys_addr));
2150 if (unlikely(error))
2151 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
2152 vmcs, phys_addr);
2153}
2154
2155static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
2156{
2157 vmcs_clear(loaded_vmcs->vmcs);
2158 if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
2159 vmcs_clear(loaded_vmcs->shadow_vmcs);
2160 loaded_vmcs->cpu = -1;
2161 loaded_vmcs->launched = 0;
2162}
2163
2164static void vmcs_load(struct vmcs *vmcs)
2165{
2166 u64 phys_addr = __pa(vmcs);
2167 bool error;
2168
2169 if (static_branch_unlikely(&enable_evmcs))
2170 return evmcs_load(phys_addr);
2171
2172 asm volatile (__ex("vmptrld %1") CC_SET(na)
2173 : CC_OUT(na) (error) : "m"(phys_addr));
2174 if (unlikely(error))
2175 printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
2176 vmcs, phys_addr);
2177}
2178
2179#ifdef CONFIG_KEXEC_CORE
2180/*
2181 * This bitmap indicates, per cpu, whether vmclear of the loaded
2182 * VMCSs in the kexec crash path is enabled. All cpus are disabled
2183 * by default.
2184 */
2185static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
2186
2187static inline void crash_enable_local_vmclear(int cpu)
2188{
2189 cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
2190}
2191
2192static inline void crash_disable_local_vmclear(int cpu)
2193{
2194 cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
2195}
2196
2197static inline int crash_local_vmclear_enabled(int cpu)
2198{
2199 return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
2200}
2201
2202static void crash_vmclear_local_loaded_vmcss(void)
2203{
2204 int cpu = raw_smp_processor_id();
2205 struct loaded_vmcs *v;
2206
2207 if (!crash_local_vmclear_enabled(cpu))
2208 return;
2209
2210 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
2211 loaded_vmcss_on_cpu_link)
2212 vmcs_clear(v->vmcs);
2213}
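/*
 * A physical CPU can hold a cached copy of any VMCS it has loaded and may
 * write it back to memory at a later time. Clearing every loaded VMCS on
 * this cpu before jumping into the kexec/crash kernel ensures that such a
 * write-back cannot land in (and corrupt) the new kernel's memory.
 */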
2214#else
2215static inline void crash_enable_local_vmclear(int cpu) { }
2216static inline void crash_disable_local_vmclear(int cpu) { }
2217#endif /* CONFIG_KEXEC_CORE */
2218
2219static void __loaded_vmcs_clear(void *arg)
2220{
2221 struct loaded_vmcs *loaded_vmcs = arg;
2222 int cpu = raw_smp_processor_id();
2223
2224 if (loaded_vmcs->cpu != cpu)
2225 return; /* vcpu migration can race with cpu offline */
2226 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
2227 per_cpu(current_vmcs, cpu) = NULL;
2228 crash_disable_local_vmclear(cpu);
2229 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
2230
2231 /*
2232 * Ensure that the update to loaded_vmcs->loaded_vmcss_on_cpu_link
2233 * happens before loaded_vmcs->cpu is set to -1 in loaded_vmcs_init().
2234 * Otherwise, another cpu could see cpu == -1 first and add the vmcs
2235 * to its per-cpu list before it is deleted here.
2236 */
2237 smp_wmb();
2238
2239 loaded_vmcs_init(loaded_vmcs);
2240 crash_enable_local_vmclear(cpu);
2241}
2242
2243static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
2244{
2245 int cpu = loaded_vmcs->cpu;
2246
2247 if (cpu != -1)
2248 smp_call_function_single(cpu,
2249 __loaded_vmcs_clear, loaded_vmcs, 1);
2250}
2251
2252static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr)
2253{
2254 if (vpid == 0)
2255 return true;
2256
2257 if (cpu_has_vmx_invvpid_individual_addr()) {
2258 __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr);
2259 return true;
2260 }
2261
2262 return false;
2263}
2264
2265static inline void vpid_sync_vcpu_single(int vpid)
2266{
2267 if (vpid == 0)
2268 return;
2269
2270 if (cpu_has_vmx_invvpid_single())
2271 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
2272}
2273
2274static inline void vpid_sync_vcpu_global(void)
2275{
2276 if (cpu_has_vmx_invvpid_global())
2277 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
2278}
2279
2280static inline void vpid_sync_context(int vpid)
2281{
2282 if (cpu_has_vmx_invvpid_single())
2283 vpid_sync_vcpu_single(vpid);
2284 else
2285 vpid_sync_vcpu_global();
2286}
2287
2288static inline void ept_sync_global(void)
2289{
2290 __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
2291}
2292
2293static inline void ept_sync_context(u64 eptp)
2294{
2295 if (cpu_has_vmx_invept_context())
2296 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
2297 else
2298 ept_sync_global();
2299}
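/*
 * The vpid_sync_*() and ept_sync_*() helpers above use the narrowest
 * invalidation type the hardware advertises and fall back to a wider one
 * otherwise: individual-address -> single-context -> all-context for
 * INVVPID, and single-context -> global for INVEPT. A vpid of 0 (VPID not
 * in use) is simply skipped by the VPID variants.
 */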
2300
2301static __always_inline void vmcs_check16(unsigned long field)
2302{
2303 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
2304 "16-bit accessor invalid for 64-bit field");
2305 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
2306 "16-bit accessor invalid for 64-bit high field");
2307 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
2308 "16-bit accessor invalid for 32-bit high field");
2309 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
2310 "16-bit accessor invalid for natural width field");
2311}
2312
2313static __always_inline void vmcs_check32(unsigned long field)
2314{
2315 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
2316 "32-bit accessor invalid for 16-bit field");
2317 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
2318 "32-bit accessor invalid for natural width field");
2319}
2320
2321static __always_inline void vmcs_check64(unsigned long field)
2322{
2323 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
2324 "64-bit accessor invalid for 16-bit field");
2325 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
2326 "64-bit accessor invalid for 64-bit high field");
2327 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
2328 "64-bit accessor invalid for 32-bit field");
2329 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
2330 "64-bit accessor invalid for natural width field");
2331}
2332
2333static __always_inline void vmcs_checkl(unsigned long field)
2334{
2335 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
2336 "Natural width accessor invalid for 16-bit field");
2337 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
2338 "Natural width accessor invalid for 64-bit field");
2339 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
2340 "Natural width accessor invalid for 64-bit high field");
2341 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
2342 "Natural width accessor invalid for 32-bit field");
2343}
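/*
 * The checks above rely on the VMCS field encoding: bits 14:13 of the field
 * select the width (0 = 16-bit, 1 = 64-bit, 2 = 32-bit, 3 = natural width)
 * and bit 0 selects the high half of a 64-bit field. For example,
 * GUEST_RFLAGS (0x6820) has bits 14:13 == 3, so only vmcs_readl()/
 * vmcs_writel() may be used on it; the constants 0x2000, 0x4000 and 0x6000
 * in the BUILD_BUG_ON_MSG()s are exactly these width encodings.
 */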
2344
2345static __always_inline unsigned long __vmcs_readl(unsigned long field)
2346{
2347 unsigned long value;
2348
2349 asm volatile (__ex_clear("vmread %1, %0", "%k0")
2350 : "=r"(value) : "r"(field));
2351 return value;
2352}
2353
2354static __always_inline u16 vmcs_read16(unsigned long field)
2355{
2356 vmcs_check16(field);
2357 if (static_branch_unlikely(&enable_evmcs))
2358 return evmcs_read16(field);
2359 return __vmcs_readl(field);
2360}
2361
2362static __always_inline u32 vmcs_read32(unsigned long field)
2363{
2364 vmcs_check32(field);
2365 if (static_branch_unlikely(&enable_evmcs))
2366 return evmcs_read32(field);
2367 return __vmcs_readl(field);
2368}
2369
2370static __always_inline u64 vmcs_read64(unsigned long field)
2371{
2372 vmcs_check64(field);
2373 if (static_branch_unlikely(&enable_evmcs))
2374 return evmcs_read64(field);
2375#ifdef CONFIG_X86_64
2376 return __vmcs_readl(field);
2377#else
2378 return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32);
2379#endif
2380}
2381
2382static __always_inline unsigned long vmcs_readl(unsigned long field)
2383{
2384 vmcs_checkl(field);
2385 if (static_branch_unlikely(&enable_evmcs))
2386 return evmcs_read64(field);
2387 return __vmcs_readl(field);
2388}
2389
2390static noinline void vmwrite_error(unsigned long field, unsigned long value)
2391{
2392 printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
2393 field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
2394 dump_stack();
2395}
2396
2397static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
2398{
2399 bool error;
2400
2401 asm volatile (__ex("vmwrite %2, %1") CC_SET(na)
2402 : CC_OUT(na) (error) : "r"(field), "rm"(value));
2403 if (unlikely(error))
2404 vmwrite_error(field, value);
2405}
2406
2407static __always_inline void vmcs_write16(unsigned long field, u16 value)
2408{
2409 vmcs_check16(field);
2410 if (static_branch_unlikely(&enable_evmcs))
2411 return evmcs_write16(field, value);
2412
2413 __vmcs_writel(field, value);
2414}
2415
2416static __always_inline void vmcs_write32(unsigned long field, u32 value)
2417{
2418 vmcs_check32(field);
2419 if (static_branch_unlikely(&enable_evmcs))
2420 return evmcs_write32(field, value);
2421
2422 __vmcs_writel(field, value);
2423}
2424
2425static __always_inline void vmcs_write64(unsigned long field, u64 value)
2426{
2427 vmcs_check64(field);
2428 if (static_branch_unlikely(&enable_evmcs))
2429 return evmcs_write64(field, value);
2430
2431 __vmcs_writel(field, value);
2432#ifndef CONFIG_X86_64
2433 asm volatile ("");
2434 __vmcs_writel(field+1, value >> 32);
2435#endif
2436}
2437
2438static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
2439{
2440 vmcs_checkl(field);
2441 if (static_branch_unlikely(&enable_evmcs))
2442 return evmcs_write64(field, value);
2443
2444 __vmcs_writel(field, value);
2445}
2446
2447static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
2448{
2449 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
2450 "vmcs_clear_bits does not support 64-bit fields");
2451 if (static_branch_unlikely(&enable_evmcs))
2452 return evmcs_write32(field, evmcs_read32(field) & ~mask);
2453
2454 __vmcs_writel(field, __vmcs_readl(field) & ~mask);
2455}
2456
2457static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
2458{
2459 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
2460 "vmcs_set_bits does not support 64-bit fields");
2461 if (static_branch_unlikely(&enable_evmcs))
2462 return evmcs_write32(field, evmcs_read32(field) | mask);
2463
2464 __vmcs_writel(field, __vmcs_readl(field) | mask);
2465}
2466
2467static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx)
2468{
2469 vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS);
2470}
2471
2472static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
2473{
2474 vmcs_write32(VM_ENTRY_CONTROLS, val);
2475 vmx->vm_entry_controls_shadow = val;
2476}
2477
2478static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
2479{
2480 if (vmx->vm_entry_controls_shadow != val)
2481 vm_entry_controls_init(vmx, val);
2482}
2483
2484static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
2485{
2486 return vmx->vm_entry_controls_shadow;
2487}
2488
2489
2490static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
2491{
2492 vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
2493}
2494
2495static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
2496{
2497 vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
2498}
2499
2500static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx)
2501{
2502 vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS);
2503}
2504
2505static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
2506{
2507 vmcs_write32(VM_EXIT_CONTROLS, val);
2508 vmx->vm_exit_controls_shadow = val;
2509}
2510
2511static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
2512{
2513 if (vmx->vm_exit_controls_shadow != val)
2514 vm_exit_controls_init(vmx, val);
2515}
2516
2517static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
2518{
2519 return vmx->vm_exit_controls_shadow;
2520}
2521
2522
2523static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
2524{
2525 vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
2526}
2527
2528static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
2529{
2530 vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
2531}
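/*
 * The *_controls_shadow fields mirror the VM_ENTRY_CONTROLS/VM_EXIT_CONTROLS
 * VMCS fields in software so that the setbit/clearbit helpers above can do a
 * read-modify-write without a VMREAD, and so that vm_*_controls_set() can
 * skip the VMWRITE entirely when the value is unchanged.
 */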
2532
2533static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
2534{
2535 vmx->segment_cache.bitmask = 0;
2536}
2537
2538static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
2539 unsigned field)
2540{
2541 bool ret;
2542 u32 mask = 1 << (seg * SEG_FIELD_NR + field);
2543
2544 if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
2545 vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
2546 vmx->segment_cache.bitmask = 0;
2547 }
2548 ret = vmx->segment_cache.bitmask & mask;
2549 vmx->segment_cache.bitmask |= mask;
2550 return ret;
2551}
2552
2553static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
2554{
2555 u16 *p = &vmx->segment_cache.seg[seg].selector;
2556
2557 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
2558 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
2559 return *p;
2560}
2561
2562static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
2563{
2564 ulong *p = &vmx->segment_cache.seg[seg].base;
2565
2566 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
2567 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
2568 return *p;
2569}
2570
2571static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
2572{
2573 u32 *p = &vmx->segment_cache.seg[seg].limit;
2574
2575 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
2576 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
2577 return *p;
2578}
2579
2580static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
2581{
2582 u32 *p = &vmx->segment_cache.seg[seg].ar;
2583
2584 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
2585 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
2586 return *p;
2587}
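/*
 * The segment cache avoids re-reading guest segment state from the VMCS:
 * each (segment, field) pair has a bit in segment_cache.bitmask, and the
 * VCPU_EXREG_SEGMENTS bit in regs_avail marks the cache as a whole as
 * valid. vmx_segment_cache_clear() zeroes the bitmask, forcing the next
 * access to each field to come from the VMCS again.
 */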
2588
2589static void update_exception_bitmap(struct kvm_vcpu *vcpu)
2590{
2591 u32 eb;
2592
2593 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
2594 (1u << DB_VECTOR) | (1u << AC_VECTOR);
2595 /*
2596 * Guest access to VMware backdoor ports could legitimately
2597 * trigger #GP because of the TSS I/O permission bitmap.
2598 * We intercept those #GPs and allow access anyway,
2599 * as VMware does.
2600 */
2601 if (enable_vmware_backdoor)
2602 eb |= (1u << GP_VECTOR);
2603 if ((vcpu->guest_debug &
2604 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
2605 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
2606 eb |= 1u << BP_VECTOR;
2607 if (to_vmx(vcpu)->rmode.vm86_active)
2608 eb = ~0;
2609 if (enable_ept)
2610 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
2611
2612 /* When we are running a nested L2 guest and L1 specified for it a
2613 * certain exception bitmap, we must trap the same exceptions and pass
2614 * them to L1. When running L2, we will only handle the exceptions
2615 * specified above if L1 did not want them.
2616 */
2617 if (is_guest_mode(vcpu))
2618 eb |= get_vmcs12(vcpu)->exception_bitmap;
2619
2620 vmcs_write32(EXCEPTION_BITMAP, eb);
2621}
2622
2623/*
2624 * Check if a write to the MSR is intercepted by the currently loaded MSR bitmap.
2625 */
2626static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
2627{
2628 unsigned long *msr_bitmap;
2629 int f = sizeof(unsigned long);
2630
2631 if (!cpu_has_vmx_msr_bitmap())
2632 return true;
2633
2634 msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
2635
2636 if (msr <= 0x1fff) {
2637 return !!test_bit(msr, msr_bitmap + 0x800 / f);
2638 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2639 msr &= 0x1fff;
2640 return !!test_bit(msr, msr_bitmap + 0xc00 / f);
2641 }
2642
2643 return true;
2644}
2645
2646/*
2647 * Check if a write to the MSR is intercepted by the L01 (vmcs01) MSR bitmap.
2648 */
2649static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
2650{
2651 unsigned long *msr_bitmap;
2652 int f = sizeof(unsigned long);
2653
2654 if (!cpu_has_vmx_msr_bitmap())
2655 return true;
2656
2657 msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
2658
2659 if (msr <= 0x1fff) {
2660 return !!test_bit(msr, msr_bitmap + 0x800 / f);
2661 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2662 msr &= 0x1fff;
2663 return !!test_bit(msr, msr_bitmap + 0xc00 / f);
2664 }
2665
2666 return true;
2667}
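/*
 * Layout used by the two helpers above: the 4K MSR bitmap holds the write
 * bitmap for "low" MSRs (0x00000000 - 0x00001fff) at offset 0x800 and the
 * write bitmap for "high" MSRs (0xc0000000 - 0xc0001fff) at offset 0xc00,
 * one bit per MSR; a set bit means the write is intercepted. MSRs outside
 * both ranges are always intercepted.
 */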
2668
2669static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
2670 unsigned long entry, unsigned long exit)
2671{
2672 vm_entry_controls_clearbit(vmx, entry);
2673 vm_exit_controls_clearbit(vmx, exit);
2674}
2675
2676static int find_msr(struct vmx_msrs *m, unsigned int msr)
2677{
2678 unsigned int i;
2679
2680 for (i = 0; i < m->nr; ++i) {
2681 if (m->val[i].index == msr)
2682 return i;
2683 }
2684 return -ENOENT;
2685}
2686
2687static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
2688{
2689 int i;
2690 struct msr_autoload *m = &vmx->msr_autoload;
2691
2692 switch (msr) {
2693 case MSR_EFER:
2694 if (cpu_has_load_ia32_efer) {
2695 clear_atomic_switch_msr_special(vmx,
2696 VM_ENTRY_LOAD_IA32_EFER,
2697 VM_EXIT_LOAD_IA32_EFER);
2698 return;
2699 }
2700 break;
2701 case MSR_CORE_PERF_GLOBAL_CTRL:
2702 if (cpu_has_load_perf_global_ctrl) {
2703 clear_atomic_switch_msr_special(vmx,
2704 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
2705 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
2706 return;
2707 }
2708 break;
2709 }
2710 i = find_msr(&m->guest, msr);
2711 if (i < 0)
2712 goto skip_guest;
2713 --m->guest.nr;
2714 m->guest.val[i] = m->guest.val[m->guest.nr];
2715 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
2716
2717skip_guest:
2718 i = find_msr(&m->host, msr);
2719 if (i < 0)
2720 return;
2721
2722 --m->host.nr;
2723 m->host.val[i] = m->host.val[m->host.nr];
2724 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
2725}
2726
2727static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
2728 unsigned long entry, unsigned long exit,
2729 unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
2730 u64 guest_val, u64 host_val)
2731{
2732 vmcs_write64(guest_val_vmcs, guest_val);
2733 if (host_val_vmcs != HOST_IA32_EFER)
2734 vmcs_write64(host_val_vmcs, host_val);
2735 vm_entry_controls_setbit(vmx, entry);
2736 vm_exit_controls_setbit(vmx, exit);
2737}
2738
2739static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
2740 u64 guest_val, u64 host_val, bool entry_only)
2741{
2742 int i, j = 0;
2743 struct msr_autoload *m = &vmx->msr_autoload;
2744
2745 switch (msr) {
2746 case MSR_EFER:
2747 if (cpu_has_load_ia32_efer) {
2748 add_atomic_switch_msr_special(vmx,
2749 VM_ENTRY_LOAD_IA32_EFER,
2750 VM_EXIT_LOAD_IA32_EFER,
2751 GUEST_IA32_EFER,
2752 HOST_IA32_EFER,
2753 guest_val, host_val);
2754 return;
2755 }
2756 break;
2757 case MSR_CORE_PERF_GLOBAL_CTRL:
2758 if (cpu_has_load_perf_global_ctrl) {
2759 add_atomic_switch_msr_special(vmx,
2760 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
2761 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
2762 GUEST_IA32_PERF_GLOBAL_CTRL,
2763 HOST_IA32_PERF_GLOBAL_CTRL,
2764 guest_val, host_val);
2765 return;
2766 }
2767 break;
2768 case MSR_IA32_PEBS_ENABLE:
2769 /* PEBS needs a quiescent period after being disabled (to write
2770 * a record). Disabling PEBS through VMX MSR swapping doesn't
2771 * provide that period, so the CPU could write the host's record
2772 * into guest memory.
2773 */
2774 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
2775 }
2776
2777 i = find_msr(&m->guest, msr);
2778 if (!entry_only)
2779 j = find_msr(&m->host, msr);
2780
2781 if (i == NR_AUTOLOAD_MSRS || j == NR_AUTOLOAD_MSRS) {
2782 printk_once(KERN_WARNING "Not enough msr switch entries. "
2783 "Can't add msr %x\n", msr);
2784 return;
2785 }
2786 if (i < 0) {
2787 i = m->guest.nr++;
2788 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
2789 }
2790 m->guest.val[i].index = msr;
2791 m->guest.val[i].value = guest_val;
2792
2793 if (entry_only)
2794 return;
2795
2796 if (j < 0) {
2797 j = m->host.nr++;
2798 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
2799 }
2800 m->host.val[j].index = msr;
2801 m->host.val[j].value = host_val;
2802}
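/*
 * The msr_autoload lists above are backed by the VM-entry MSR-load and
 * VM-exit MSR-load areas, whose entry counts are kept in
 * VM_ENTRY_MSR_LOAD_COUNT and VM_EXIT_MSR_LOAD_COUNT. MSR_EFER and
 * MSR_CORE_PERF_GLOBAL_CTRL get a faster "special" treatment when the CPU
 * supports it: they are switched via dedicated guest/host VMCS fields and
 * the corresponding VM-entry/VM-exit control bits instead of the generic
 * lists.
 */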
2803
2804static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
2805{
2806 u64 guest_efer = vmx->vcpu.arch.efer;
2807 u64 ignore_bits = 0;
2808
2809 if (!enable_ept) {
2810 /*
2811 * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing
2812 * host CPUID is more efficient than testing guest CPUID
2813 * or CR4. Host SMEP is anyway a requirement for guest SMEP.
2814 */
2815 if (boot_cpu_has(X86_FEATURE_SMEP))
2816 guest_efer |= EFER_NX;
2817 else if (!(guest_efer & EFER_NX))
2818 ignore_bits |= EFER_NX;
2819 }
2820
2821 /*
2822 * LMA and LME handled by hardware; SCE meaningless outside long mode.
2823 */
2824 ignore_bits |= EFER_SCE;
2825#ifdef CONFIG_X86_64
2826 ignore_bits |= EFER_LMA | EFER_LME;
2827 /* SCE is meaningful only in long mode on Intel */
2828 if (guest_efer & EFER_LMA)
2829 ignore_bits &= ~(u64)EFER_SCE;
2830#endif
2831
2832 /*
2833 * On EPT, we can't emulate NX, so we must switch EFER atomically.
2834 * On CPUs that support "load IA32_EFER", always switch EFER
2835 * atomically, since it's faster than switching it manually.
2836 */
2837 if (cpu_has_load_ia32_efer ||
2838 (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
2839 if (!(guest_efer & EFER_LMA))
2840 guest_efer &= ~EFER_LME;
2841 if (guest_efer != host_efer)
2842 add_atomic_switch_msr(vmx, MSR_EFER,
2843 guest_efer, host_efer, false);
2844 else
2845 clear_atomic_switch_msr(vmx, MSR_EFER);
2846 return false;
2847 } else {
2848 clear_atomic_switch_msr(vmx, MSR_EFER);
2849
2850 guest_efer &= ~ignore_bits;
2851 guest_efer |= host_efer & ignore_bits;
2852
2853 vmx->guest_msrs[efer_offset].data = guest_efer;
2854 vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
2855
2856 return true;
2857 }
2858}
2859
2860#ifdef CONFIG_X86_32
2861/*
2862 * On 32-bit kernels, VM exits still load the FS and GS bases from the
2863 * VMCS rather than the segment table. KVM uses this helper to figure
2864 * out the current bases to poke them into the VMCS before entry.
2865 */
2866static unsigned long segment_base(u16 selector)
2867{
2868 struct desc_struct *table;
2869 unsigned long v;
2870
2871 if (!(selector & ~SEGMENT_RPL_MASK))
2872 return 0;
2873
2874 table = get_current_gdt_ro();
2875
2876 if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
2877 u16 ldt_selector = kvm_read_ldt();
2878
2879 if (!(ldt_selector & ~SEGMENT_RPL_MASK))
2880 return 0;
2881
2882 table = (struct desc_struct *)segment_base(ldt_selector);
2883 }
2884 v = get_desc_base(&table[selector >> 3]);
2885 return v;
2886}
2887#endif
2888
2889static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
2890{
2891 struct vcpu_vmx *vmx = to_vmx(vcpu);
2892 struct vmcs_host_state *host_state;
2893#ifdef CONFIG_X86_64
2894 int cpu = raw_smp_processor_id();
2895#endif
2896 unsigned long fs_base, gs_base;
2897 u16 fs_sel, gs_sel;
2898 int i;
2899
2900 vmx->req_immediate_exit = false;
2901
2902 /*
2903 * Note that guest MSRs to be saved/restored can also be changed
2904 * when guest state is loaded. This happens when guest transitions
2905 * to/from long-mode by setting MSR_EFER.LMA.
2906 */
2907 if (!vmx->loaded_cpu_state || vmx->guest_msrs_dirty) {
2908 vmx->guest_msrs_dirty = false;
2909 for (i = 0; i < vmx->save_nmsrs; ++i)
2910 kvm_set_shared_msr(vmx->guest_msrs[i].index,
2911 vmx->guest_msrs[i].data,
2912 vmx->guest_msrs[i].mask);
2913
2914 }
2915
2916 if (vmx->loaded_cpu_state)
2917 return;
2918
2919 vmx->loaded_cpu_state = vmx->loaded_vmcs;
2920 host_state = &vmx->loaded_cpu_state->host_state;
2921
2922 /*
2923 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
2924 * allow segment selectors with cpl > 0 or ti == 1.
2925 */
2926 host_state->ldt_sel = kvm_read_ldt();
2927
2928#ifdef CONFIG_X86_64
2929 savesegment(ds, host_state->ds_sel);
2930 savesegment(es, host_state->es_sel);
2931
2932 gs_base = cpu_kernelmode_gs_base(cpu);
2933 if (likely(is_64bit_mm(current->mm))) {
2934 save_fsgs_for_kvm();
2935 fs_sel = current->thread.fsindex;
2936 gs_sel = current->thread.gsindex;
2937 fs_base = current->thread.fsbase;
2938 vmx->msr_host_kernel_gs_base = current->thread.gsbase;
2939 } else {
2940 savesegment(fs, fs_sel);
2941 savesegment(gs, gs_sel);
2942 fs_base = read_msr(MSR_FS_BASE);
2943 vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
2944 }
2945
2946 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
2947#else
2948 savesegment(fs, fs_sel);
2949 savesegment(gs, gs_sel);
2950 fs_base = segment_base(fs_sel);
2951 gs_base = segment_base(gs_sel);
2952#endif
2953
2954 if (unlikely(fs_sel != host_state->fs_sel)) {
2955 if (!(fs_sel & 7))
2956 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
2957 else
2958 vmcs_write16(HOST_FS_SELECTOR, 0);
2959 host_state->fs_sel = fs_sel;
2960 }
2961 if (unlikely(gs_sel != host_state->gs_sel)) {
2962 if (!(gs_sel & 7))
2963 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
2964 else
2965 vmcs_write16(HOST_GS_SELECTOR, 0);
2966 host_state->gs_sel = gs_sel;
2967 }
2968 if (unlikely(fs_base != host_state->fs_base)) {
2969 vmcs_writel(HOST_FS_BASE, fs_base);
2970 host_state->fs_base = fs_base;
2971 }
2972 if (unlikely(gs_base != host_state->gs_base)) {
2973 vmcs_writel(HOST_GS_BASE, gs_base);
2974 host_state->gs_base = gs_base;
2975 }
2976}
2977
2978static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
2979{
2980 struct vmcs_host_state *host_state;
2981
2982 if (!vmx->loaded_cpu_state)
2983 return;
2984
2985 WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs);
2986 host_state = &vmx->loaded_cpu_state->host_state;
2987
2988 ++vmx->vcpu.stat.host_state_reload;
2989 vmx->loaded_cpu_state = NULL;
2990
2991#ifdef CONFIG_X86_64
2992 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
2993#endif
2994 if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
2995 kvm_load_ldt(host_state->ldt_sel);
2996#ifdef CONFIG_X86_64
2997 load_gs_index(host_state->gs_sel);
2998#else
2999 loadsegment(gs, host_state->gs_sel);
3000#endif
3001 }
3002 if (host_state->fs_sel & 7)
3003 loadsegment(fs, host_state->fs_sel);
3004#ifdef CONFIG_X86_64
3005 if (unlikely(host_state->ds_sel | host_state->es_sel)) {
3006 loadsegment(ds, host_state->ds_sel);
3007 loadsegment(es, host_state->es_sel);
3008 }
3009#endif
3010 invalidate_tss_limit();
3011#ifdef CONFIG_X86_64
3012 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
3013#endif
3014 load_fixmap_gdt(raw_smp_processor_id());
3015}
3016
3017#ifdef CONFIG_X86_64
3018static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
3019{
3020 preempt_disable();
3021 if (vmx->loaded_cpu_state)
3022 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
3023 preempt_enable();
3024 return vmx->msr_guest_kernel_gs_base;
3025}
3026
3027static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
3028{
3029 preempt_disable();
3030 if (vmx->loaded_cpu_state)
3031 wrmsrl(MSR_KERNEL_GS_BASE, data);
3032 preempt_enable();
3033 vmx->msr_guest_kernel_gs_base = data;
3034}
3035#endif
3036
3037static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
3038{
3039 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
3040 struct pi_desc old, new;
3041 unsigned int dest;
3042
3043 /*
3044 * In case of hot-plug or hot-unplug, we may have to undo
3045 * vmx_vcpu_pi_put even if there is no assigned device. And we
3046 * always keep PI.NDST up to date for simplicity: it makes the
3047 * code easier, and CPU migration is not a fast path.
3048 */
3049 if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
3050 return;
3051
3052 /*
3053 * First handle the simple case where no cmpxchg is necessary; just
3054 * allow posting non-urgent interrupts.
3055 *
3056 * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
3057 * PI.NDST: pi_post_block will do it for us and the wakeup_handler
3058 * expects the VCPU to be on the blocked_vcpu_list that matches
3059 * PI.NDST.
3060 */
3061 if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
3062 vcpu->cpu == cpu) {
3063 pi_clear_sn(pi_desc);
3064 return;
3065 }
3066
3067 /* The full case. */
3068 do {
3069 old.control = new.control = pi_desc->control;
3070
3071 dest = cpu_physical_id(cpu);
3072
3073 if (x2apic_enabled())
3074 new.ndst = dest;
3075 else
3076 new.ndst = (dest << 8) & 0xFF00;
3077
3078 new.sn = 0;
3079 } while (cmpxchg64(&pi_desc->control, old.control,
3080 new.control) != old.control);
3081}
3082
3083static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
3084{
3085 vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
3086 vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
3087}
3088
3089/*
3090 * Switches to the specified vcpu until a matching vcpu_put(); assumes
3091 * the vcpu mutex is already taken.
3092 */
3093static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
3094{
3095 struct vcpu_vmx *vmx = to_vmx(vcpu);
3096 bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
3097
3098 if (!already_loaded) {
3099 loaded_vmcs_clear(vmx->loaded_vmcs);
3100 local_irq_disable();
3101 crash_disable_local_vmclear(cpu);
3102
3103 /*
3104 * Read loaded_vmcs->cpu should be before fetching
3105 * loaded_vmcs->loaded_vmcss_on_cpu_link.
3106 * See the comments in __loaded_vmcs_clear().
3107 */
3108 smp_rmb();
3109
3110 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
3111 &per_cpu(loaded_vmcss_on_cpu, cpu));
3112 crash_enable_local_vmclear(cpu);
3113 local_irq_enable();
3114 }
3115
3116 if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
3117 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
3118 vmcs_load(vmx->loaded_vmcs->vmcs);
3119 indirect_branch_prediction_barrier();
3120 }
3121
3122 if (!already_loaded) {
3123 void *gdt = get_current_gdt_ro();
3124 unsigned long sysenter_esp;
3125
3126 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3127
3128 /*
3129 * Linux uses per-cpu TSS and GDT, so set these when switching
3130 * processors. See 22.2.4.
3131 */
3132 vmcs_writel(HOST_TR_BASE,
3133 (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
3134 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
3135
3136 /*
3137 * VM exits change the host TR limit to 0x67 after a VM
3138 * exit. This is okay, since 0x67 covers everything except
3139 * the IO bitmap, and we have code to handle the IO bitmap
3140 * being lost after a VM exit.
3141 */
3142 BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);
3143
3144 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
3145 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
3146
3147 vmx->loaded_vmcs->cpu = cpu;
3148 }
3149
3150 /* Setup TSC multiplier */
3151 if (kvm_has_tsc_control &&
3152 vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
3153 decache_tsc_multiplier(vmx);
3154
3155 vmx_vcpu_pi_load(vcpu, cpu);
3156 vmx->host_pkru = read_pkru();
3157 vmx->host_debugctlmsr = get_debugctlmsr();
3158}
3159
3160static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
3161{
3162 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
3163
3164 if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
3165 !irq_remapping_cap(IRQ_POSTING_CAP) ||
3166 !kvm_vcpu_apicv_active(vcpu))
3167 return;
3168
3169 /* Set SN when the vCPU is preempted */
3170 if (vcpu->preempted)
3171 pi_set_sn(pi_desc);
3172}
3173
3174static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
3175{
3176 vmx_vcpu_pi_put(vcpu);
3177
3178 vmx_prepare_switch_to_host(to_vmx(vcpu));
3179}
3180
3181static bool emulation_required(struct kvm_vcpu *vcpu)
3182{
3183 return emulate_invalid_guest_state && !guest_state_valid(vcpu);
3184}
3185
3186static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
3187
3188/*
3189 * Return the cr0 value that a nested guest would read. This is a combination
3190 * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
3191 * its hypervisor (cr0_read_shadow).
3192 */
3193static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
3194{
3195 return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
3196 (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
3197}
3198static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
3199{
3200 return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
3201 (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
3202}
3203
3204static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
3205{
3206 unsigned long rflags, save_rflags;
3207
3208 if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
3209 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
3210 rflags = vmcs_readl(GUEST_RFLAGS);
3211 if (to_vmx(vcpu)->rmode.vm86_active) {
3212 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
3213 save_rflags = to_vmx(vcpu)->rmode.save_rflags;
3214 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
3215 }
3216 to_vmx(vcpu)->rflags = rflags;
3217 }
3218 return to_vmx(vcpu)->rflags;
3219}
3220
3221static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
3222{
3223 unsigned long old_rflags = vmx_get_rflags(vcpu);
3224
3225 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
3226 to_vmx(vcpu)->rflags = rflags;
3227 if (to_vmx(vcpu)->rmode.vm86_active) {
3228 to_vmx(vcpu)->rmode.save_rflags = rflags;
3229 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
3230 }
3231 vmcs_writel(GUEST_RFLAGS, rflags);
3232
3233 if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM)
3234 to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
3235}
3236
3237static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
3238{
3239 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3240 int ret = 0;
3241
3242 if (interruptibility & GUEST_INTR_STATE_STI)
3243 ret |= KVM_X86_SHADOW_INT_STI;
3244 if (interruptibility & GUEST_INTR_STATE_MOV_SS)
3245 ret |= KVM_X86_SHADOW_INT_MOV_SS;
3246
3247 return ret;
3248}
3249
3250static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
3251{
3252 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3253 u32 interruptibility = interruptibility_old;
3254
3255 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
3256
3257 if (mask & KVM_X86_SHADOW_INT_MOV_SS)
3258 interruptibility |= GUEST_INTR_STATE_MOV_SS;
3259 else if (mask & KVM_X86_SHADOW_INT_STI)
3260 interruptibility |= GUEST_INTR_STATE_STI;
3261
3262 if ((interruptibility != interruptibility_old))
3263 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
3264}
3265
3266static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
3267{
3268 unsigned long rip;
3269
3270 rip = kvm_rip_read(vcpu);
3271 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3272 kvm_rip_write(vcpu, rip);
3273
3274 /* skipping an emulated instruction also counts */
3275 vmx_set_interrupt_shadow(vcpu, 0);
3276}
3277
3278static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
3279 unsigned long exit_qual)
3280{
3281 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3282 unsigned int nr = vcpu->arch.exception.nr;
3283 u32 intr_info = nr | INTR_INFO_VALID_MASK;
3284
3285 if (vcpu->arch.exception.has_error_code) {
3286 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
3287 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3288 }
3289
3290 if (kvm_exception_is_soft(nr))
3291 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3292 else
3293 intr_info |= INTR_TYPE_HARD_EXCEPTION;
3294
3295 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
3296 vmx_get_nmi_mask(vcpu))
3297 intr_info |= INTR_INFO_UNBLOCK_NMI;
3298
3299 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
3300}
3301
3302/*
3303 * KVM wants to inject page faults that it received into the guest. For a
3304 * nested guest, this function checks whether to inject them into L1 or L2.
3305 */
3306static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
3307{
3308 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3309 unsigned int nr = vcpu->arch.exception.nr;
3310 bool has_payload = vcpu->arch.exception.has_payload;
3311 unsigned long payload = vcpu->arch.exception.payload;
3312
3313 if (nr == PF_VECTOR) {
3314 if (vcpu->arch.exception.nested_apf) {
3315 *exit_qual = vcpu->arch.apf.nested_apf_token;
3316 return 1;
3317 }
3318 if (nested_vmx_is_page_fault_vmexit(vmcs12,
3319 vcpu->arch.exception.error_code)) {
3320 *exit_qual = has_payload ? payload : vcpu->arch.cr2;
3321 return 1;
3322 }
3323 } else if (vmcs12->exception_bitmap & (1u << nr)) {
3324 if (nr == DB_VECTOR) {
3325 if (!has_payload) {
3326 payload = vcpu->arch.dr6;
3327 payload &= ~(DR6_FIXED_1 | DR6_BT);
3328 payload ^= DR6_RTM;
3329 }
3330 *exit_qual = payload;
3331 } else
3332 *exit_qual = 0;
3333 return 1;
3334 }
3335
3336 return 0;
3337}
3338
3339static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
3340{
3341 /*
3342 * Ensure that we clear the HLT state in the VMCS. We don't need to
3343 * explicitly skip the instruction because if the HLT state is set,
3344 * then the instruction is already executing and RIP has already been
3345 * advanced.
3346 */
3347 if (kvm_hlt_in_guest(vcpu->kvm) &&
3348 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
3349 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
3350}
3351
3352static void vmx_queue_exception(struct kvm_vcpu *vcpu)
3353{
3354 struct vcpu_vmx *vmx = to_vmx(vcpu);
3355 unsigned nr = vcpu->arch.exception.nr;
3356 bool has_error_code = vcpu->arch.exception.has_error_code;
3357 u32 error_code = vcpu->arch.exception.error_code;
3358 u32 intr_info = nr | INTR_INFO_VALID_MASK;
3359
3360 kvm_deliver_exception_payload(vcpu);
3361
3362 if (has_error_code) {
3363 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
3364 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3365 }
3366
3367 if (vmx->rmode.vm86_active) {
3368 int inc_eip = 0;
3369 if (kvm_exception_is_soft(nr))
3370 inc_eip = vcpu->arch.event_exit_inst_len;
3371 if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
3372 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3373 return;
3374 }
3375
3376 WARN_ON_ONCE(vmx->emulation_required);
3377
3378 if (kvm_exception_is_soft(nr)) {
3379 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
3380 vmx->vcpu.arch.event_exit_inst_len);
3381 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3382 } else
3383 intr_info |= INTR_TYPE_HARD_EXCEPTION;
3384
3385 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
3386
3387 vmx_clear_hlt(vcpu);
3388}
3389
3390static bool vmx_rdtscp_supported(void)
3391{
3392 return cpu_has_vmx_rdtscp();
3393}
3394
3395static bool vmx_invpcid_supported(void)
3396{
3397 return cpu_has_vmx_invpcid();
3398}
3399
3400/*
3401 * Swap MSR entry in host/guest MSR entry array.
3402 */
3403static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
3404{
3405 struct shared_msr_entry tmp;
3406
3407 tmp = vmx->guest_msrs[to];
3408 vmx->guest_msrs[to] = vmx->guest_msrs[from];
3409 vmx->guest_msrs[from] = tmp;
3410}
3411
3412/*
3413 * Set up the vmcs to automatically save and restore system
3414 * msrs. Don't touch the 64-bit msrs if the guest is in legacy
3415 * mode, as fiddling with msrs is very expensive.
3416 */
3417static void setup_msrs(struct vcpu_vmx *vmx)
3418{
3419 int save_nmsrs, index;
3420
3421 save_nmsrs = 0;
3422#ifdef CONFIG_X86_64
3423 if (is_long_mode(&vmx->vcpu)) {
3424 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
3425 if (index >= 0)
3426 move_msr_up(vmx, index, save_nmsrs++);
3427 index = __find_msr_index(vmx, MSR_LSTAR);
3428 if (index >= 0)
3429 move_msr_up(vmx, index, save_nmsrs++);
3430 index = __find_msr_index(vmx, MSR_CSTAR);
3431 if (index >= 0)
3432 move_msr_up(vmx, index, save_nmsrs++);
3433 index = __find_msr_index(vmx, MSR_TSC_AUX);
3434 if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
3435 move_msr_up(vmx, index, save_nmsrs++);
3436 /*
3437 * MSR_STAR is only needed on long mode guests, and only
3438 * if efer.sce is enabled.
3439 */
3440 index = __find_msr_index(vmx, MSR_STAR);
3441 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
3442 move_msr_up(vmx, index, save_nmsrs++);
3443 }
3444#endif
3445 index = __find_msr_index(vmx, MSR_EFER);
3446 if (index >= 0 && update_transition_efer(vmx, index))
3447 move_msr_up(vmx, index, save_nmsrs++);
3448
3449 vmx->save_nmsrs = save_nmsrs;
3450 vmx->guest_msrs_dirty = true;
3451
3452 if (cpu_has_vmx_msr_bitmap())
3453 vmx_update_msr_bitmap(&vmx->vcpu);
3454}
3455
3456static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
3457{
3458 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3459
3460 if (is_guest_mode(vcpu) &&
3461 (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
3462 return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
3463
3464 return vcpu->arch.tsc_offset;
3465}
3466
3467static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
3468{
3469 u64 active_offset = offset;
3470 if (is_guest_mode(vcpu)) {
3471 /*
3472 * We're here if L1 chose not to trap WRMSR to TSC. According
3473 * to the spec, this should set L1's TSC; the offset that L1
3474 * set for L2 remains unchanged, and still needs to be added
3475 * to the newly set TSC to get L2's TSC.
3476 */
3477 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3478 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING))
3479 active_offset += vmcs12->tsc_offset;
3480 } else {
3481 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
3482 vmcs_read64(TSC_OFFSET), offset);
3483 }
3484
3485 vmcs_write64(TSC_OFFSET, active_offset);
3486 return active_offset;
3487}
3488
3489/*
3490 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
3491 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
3492 * all guests if the "nested" module option is off, and can also be disabled
3493 * for a single guest by disabling its VMX cpuid bit.
3494 */
3495static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
3496{
3497 return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
3498}
3499
3500/*
3501 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
3502 * returned for the various VMX controls MSRs when nested VMX is enabled.
3503 * The same values should also be used to verify that vmcs12 control fields are
3504 * valid during nested entry from L1 to L2.
3505 * Each of these control msrs has a low and high 32-bit half: A low bit is on
3506 * if the corresponding bit in the (32-bit) control field *must* be on, and a
3507 * bit in the high half is on if the corresponding bit in the control field
3508 * may be on. See also vmx_control_verify().
3509 */
3510static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
3511{
3512 if (!nested) {
3513 memset(msrs, 0, sizeof(*msrs));
3514 return;
3515 }
3516
3517 /*
3518 * Note that as a general rule, the high half of the MSRs (bits in
3519 * the control fields which may be 1) should be initialized by the
3520 * intersection of the underlying hardware's MSR (i.e., features which
3521 * can be supported) and the list of features we want to expose -
3522 * because they are known to be properly supported in our code.
3523 * Also, usually, the low half of the MSRs (bits which must be 1) can
3524 * be set to 0, meaning that L1 may turn off any of these bits. The
3525 * reason is that if one of these bits is necessary, it will already be
3526 * set in vmcs01, and prepare_vmcs02, which bitwise-ORs the control
3527 * fields of vmcs01 and vmcs12, will keep it set in vmcs02 - and
3528 * nested_vmx_exit_reflected() will not pass the related exits to L1.
3529 * These rules have exceptions below.
3530 */
3531
3532 /* pin-based controls */
3533 rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
3534 msrs->pinbased_ctls_low,
3535 msrs->pinbased_ctls_high);
3536 msrs->pinbased_ctls_low |=
3537 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
3538 msrs->pinbased_ctls_high &=
3539 PIN_BASED_EXT_INTR_MASK |
3540 PIN_BASED_NMI_EXITING |
3541 PIN_BASED_VIRTUAL_NMIS |
3542 (apicv ? PIN_BASED_POSTED_INTR : 0);
3543 msrs->pinbased_ctls_high |=
3544 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
3545 PIN_BASED_VMX_PREEMPTION_TIMER;
3546
3547 /* exit controls */
3548 rdmsr(MSR_IA32_VMX_EXIT_CTLS,
3549 msrs->exit_ctls_low,
3550 msrs->exit_ctls_high);
3551 msrs->exit_ctls_low =
3552 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
3553
3554 msrs->exit_ctls_high &=
3555#ifdef CONFIG_X86_64
3556 VM_EXIT_HOST_ADDR_SPACE_SIZE |
3557#endif
3558 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
3559 msrs->exit_ctls_high |=
3560 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
3561 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
3562 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
3563
3564 /* We support free control of debug control saving. */
3565 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
3566
3567 /* entry controls */
3568 rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
3569 msrs->entry_ctls_low,
3570 msrs->entry_ctls_high);
3571 msrs->entry_ctls_low =
3572 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
3573 msrs->entry_ctls_high &=
3574#ifdef CONFIG_X86_64
3575 VM_ENTRY_IA32E_MODE |
3576#endif
3577 VM_ENTRY_LOAD_IA32_PAT;
3578 msrs->entry_ctls_high |=
3579 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
3580
3581 /* We support free control of debug control loading. */
3582 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
3583
3584 /* cpu-based controls */
3585 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
3586 msrs->procbased_ctls_low,
3587 msrs->procbased_ctls_high);
3588 msrs->procbased_ctls_low =
3589 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
3590 msrs->procbased_ctls_high &=
3591 CPU_BASED_VIRTUAL_INTR_PENDING |
3592 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
3593 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
3594 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
3595 CPU_BASED_CR3_STORE_EXITING |
3596#ifdef CONFIG_X86_64
3597 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
3598#endif
3599 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
3600 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
3601 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
3602 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
3603 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
3604 /*
3605 * We can allow some features even when not supported by the
3606 * hardware. For example, L1 can specify an MSR bitmap - and we
3607 * can use it to avoid exits to L1 - even when L0 runs L2
3608 * without MSR bitmaps.
3609 */
3610 msrs->procbased_ctls_high |=
3611 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
3612 CPU_BASED_USE_MSR_BITMAPS;
3613
3614 /* We support free control of CR3 access interception. */
3615 msrs->procbased_ctls_low &=
3616 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
3617
3618 /*
3619 * secondary cpu-based controls. Do not include those that
3620 * depend on CPUID bits; they are added later by vmx_cpuid_update.
3621 */
3622 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
3623 msrs->secondary_ctls_low,
3624 msrs->secondary_ctls_high);
3625 msrs->secondary_ctls_low = 0;
3626 msrs->secondary_ctls_high &=
3627 SECONDARY_EXEC_DESC |
3628 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
3629 SECONDARY_EXEC_APIC_REGISTER_VIRT |
3630 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
3631 SECONDARY_EXEC_WBINVD_EXITING;
3632
3633 /*
3634 * We can emulate "VMCS shadowing," even if the hardware
3635 * doesn't support it.
3636 */
3637 msrs->secondary_ctls_high |=
3638 SECONDARY_EXEC_SHADOW_VMCS;
3639
3640 if (enable_ept) {
3641 /* nested EPT: emulate EPT also to L1 */
3642 msrs->secondary_ctls_high |=
3643 SECONDARY_EXEC_ENABLE_EPT;
3644 msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
3645 VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
3646 if (cpu_has_vmx_ept_execute_only())
3647 msrs->ept_caps |=
3648 VMX_EPT_EXECUTE_ONLY_BIT;
3649 msrs->ept_caps &= vmx_capability.ept;
3650 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
3651 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
3652 VMX_EPT_1GB_PAGE_BIT;
3653 if (enable_ept_ad_bits) {
3654 msrs->secondary_ctls_high |=
3655 SECONDARY_EXEC_ENABLE_PML;
3656 msrs->ept_caps |= VMX_EPT_AD_BIT;
3657 }
3658 }
3659
3660 if (cpu_has_vmx_vmfunc()) {
3661 msrs->secondary_ctls_high |=
3662 SECONDARY_EXEC_ENABLE_VMFUNC;
3663 /*
3664 * Advertise EPTP switching unconditionally
3665 * since we emulate it
3666 */
3667 if (enable_ept)
3668 msrs->vmfunc_controls =
3669 VMX_VMFUNC_EPTP_SWITCHING;
3670 }
3671
3672 /*
3673 * Old versions of KVM use the single-context version without
3674 * checking for support, so declare that it is supported even
3675 * though it is treated as global context. The alternative is
3676 * not failing the single-context invvpid, and it is worse.
3677 */
3678 if (enable_vpid) {
3679 msrs->secondary_ctls_high |=
3680 SECONDARY_EXEC_ENABLE_VPID;
3681 msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
3682 VMX_VPID_EXTENT_SUPPORTED_MASK;
3683 }
3684
3685 if (enable_unrestricted_guest)
3686 msrs->secondary_ctls_high |=
3687 SECONDARY_EXEC_UNRESTRICTED_GUEST;
3688
3689 if (flexpriority_enabled)
3690 msrs->secondary_ctls_high |=
3691 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
3692
3693 /* miscellaneous data */
3694 rdmsr(MSR_IA32_VMX_MISC,
3695 msrs->misc_low,
3696 msrs->misc_high);
3697 msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
3698 msrs->misc_low |=
3699 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
3700 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
3701 VMX_MISC_ACTIVITY_HLT;
3702 msrs->misc_high = 0;
3703
3704 /*
3705 * This MSR reports some information about VMX support. We
3706 * should return information about the VMX we emulate for the
3707 * guest, and the VMCS structure we give it - not about the
3708 * VMX support of the underlying hardware.
3709 */
3710 msrs->basic =
3711 VMCS12_REVISION |
3712 VMX_BASIC_TRUE_CTLS |
3713 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
3714 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
3715
3716 if (cpu_has_vmx_basic_inout())
3717 msrs->basic |= VMX_BASIC_INOUT;
3718
3719 /*
3720 * These MSRs specify bits which the guest must keep fixed on
3721 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
3722 * We picked the standard core2 setting.
3723 */
3724#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
3725#define VMXON_CR4_ALWAYSON X86_CR4_VMXE
3726 msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
3727 msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
3728
3729 /* These MSRs report the bits which may be set; all other bits must be kept fixed off. */
3730 rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
3731 rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
3732
3733 /* highest index: VMX_PREEMPTION_TIMER_VALUE */
3734 msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
3735}
3736
3737/*
3738 * if fixed0[i] == 1: val[i] must be 1
3739 * if fixed1[i] == 0: val[i] must be 0
3740 */
3741static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1)
3742{
3743 return ((val & fixed1) | fixed0) == val;
3744}
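/*
 * Illustration of fixed_bits_valid(): with fixed0 == 0x1 and fixed1 == 0xf,
 * a value of 0x3 is valid ((0x3 & 0xf) | 0x1 == 0x3), while 0x2 is rejected
 * because the must-be-1 bit 0 is clear, and 0x10 is rejected because bit 4
 * is not allowed by fixed1.
 */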
3745
3746static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
3747{
3748 return fixed_bits_valid(control, low, high);
3749}
3750
3751static inline u64 vmx_control_msr(u32 low, u32 high)
3752{
3753 return low | ((u64)high << 32);
3754}
3755
3756static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
3757{
3758 superset &= mask;
3759 subset &= mask;
3760
3761 return (superset | subset) == superset;
3762}
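/*
 * I.e. every bit of 'subset' that lies within 'mask' must also be set in
 * 'superset'; e.g. is_bitwise_subset(0xa, 0x2, 0xf) is true, while
 * is_bitwise_subset(0xa, 0x4, 0xf) is false.
 */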
3763
3764static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
3765{
3766 const u64 feature_and_reserved =
3767 /* feature (except bit 48; see below) */
3768 BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
3769 /* reserved */
3770 BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
3771 u64 vmx_basic = vmx->nested.msrs.basic;
3772
3773 if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
3774 return -EINVAL;
3775
3776 /*
3777 * KVM does not emulate a version of VMX that constrains physical
3778 * addresses of VMX structures (e.g. VMCS) to 32-bits.
3779 */
3780 if (data & BIT_ULL(48))
3781 return -EINVAL;
3782
3783 if (vmx_basic_vmcs_revision_id(vmx_basic) !=
3784 vmx_basic_vmcs_revision_id(data))
3785 return -EINVAL;
3786
3787 if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
3788 return -EINVAL;
3789
3790 vmx->nested.msrs.basic = data;
3791 return 0;
3792}
3793
3794static int
3795vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
3796{
3797 u64 supported;
3798 u32 *lowp, *highp;
3799
3800 switch (msr_index) {
3801 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
3802 lowp = &vmx->nested.msrs.pinbased_ctls_low;
3803 highp = &vmx->nested.msrs.pinbased_ctls_high;
3804 break;
3805 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
3806 lowp = &vmx->nested.msrs.procbased_ctls_low;
3807 highp = &vmx->nested.msrs.procbased_ctls_high;
3808 break;
3809 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
3810 lowp = &vmx->nested.msrs.exit_ctls_low;
3811 highp = &vmx->nested.msrs.exit_ctls_high;
3812 break;
3813 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
3814 lowp = &vmx->nested.msrs.entry_ctls_low;
3815 highp = &vmx->nested.msrs.entry_ctls_high;
3816 break;
3817 case MSR_IA32_VMX_PROCBASED_CTLS2:
3818 lowp = &vmx->nested.msrs.secondary_ctls_low;
3819 highp = &vmx->nested.msrs.secondary_ctls_high;
3820 break;
3821 default:
3822 BUG();
3823 }
3824
3825 supported = vmx_control_msr(*lowp, *highp);
3826
3827 /* Check must-be-1 bits are still 1. */
3828 if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
3829 return -EINVAL;
3830
3831 /* Check must-be-0 bits are still 0. */
3832 if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
3833 return -EINVAL;
3834
3835 *lowp = data;
3836 *highp = data >> 32;
3837 return 0;
3838}
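/*
 * Net effect of the two subset checks above: userspace may only make the
 * virtual VMX capabilities more restrictive than what KVM advertised - it
 * can add must-be-1 bits and remove may-be-1 bits, but it can never grant
 * a control that KVM does not support.
 */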
3839
3840static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
3841{
3842 const u64 feature_and_reserved_bits =
3843 /* feature */
3844 BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
3845 BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
3846 /* reserved */
3847 GENMASK_ULL(13, 9) | BIT_ULL(31);
3848 u64 vmx_misc;
3849
3850 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
3851 vmx->nested.msrs.misc_high);
3852
3853 if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
3854 return -EINVAL;
3855
3856 if ((vmx->nested.msrs.pinbased_ctls_high &
3857 PIN_BASED_VMX_PREEMPTION_TIMER) &&
3858 vmx_misc_preemption_timer_rate(data) !=
3859 vmx_misc_preemption_timer_rate(vmx_misc))
3860 return -EINVAL;
3861
3862 if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
3863 return -EINVAL;
3864
3865 if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
3866 return -EINVAL;
3867
3868 if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
3869 return -EINVAL;
3870
3871 vmx->nested.msrs.misc_low = data;
3872 vmx->nested.msrs.misc_high = data >> 32;
3873
3874 /*
3875 * If L1 has read-only VM-exit information fields, use the
3876 * less permissive vmx_vmwrite_bitmap to specify write
3877 * permissions for the shadow VMCS.
3878 */
3879 if (enable_shadow_vmcs && !nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
3880 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
3881
3882 return 0;
3883}
3884
3885static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
3886{
3887 u64 vmx_ept_vpid_cap;
3888
3889 vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
3890 vmx->nested.msrs.vpid_caps);
3891
3892 /* Every bit is either reserved or a feature bit. */
3893 if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
3894 return -EINVAL;
3895
3896 vmx->nested.msrs.ept_caps = data;
3897 vmx->nested.msrs.vpid_caps = data >> 32;
3898 return 0;
3899}
3900
3901static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
3902{
3903 u64 *msr;
3904
3905 switch (msr_index) {
3906 case MSR_IA32_VMX_CR0_FIXED0:
3907 msr = &vmx->nested.msrs.cr0_fixed0;
3908 break;
3909 case MSR_IA32_VMX_CR4_FIXED0:
3910 msr = &vmx->nested.msrs.cr4_fixed0;
3911 break;
3912 default:
3913 BUG();
3914 }
3915
3916 /*
3917	 * Bits that are 1 here (i.e. bits that "must be 1" during VMX operation)
3918	 * must also be 1 in the restored value.
3919 */
3920 if (!is_bitwise_subset(data, *msr, -1ULL))
3921 return -EINVAL;
3922
3923 *msr = data;
3924 return 0;
3925}
3926
3927/*
3928 * Called when userspace is restoring VMX MSRs.
3929 *
3930 * Returns 0 on success, non-0 otherwise.
3931 */
3932static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
3933{
3934 struct vcpu_vmx *vmx = to_vmx(vcpu);
3935
3936 /*
3937 * Don't allow changes to the VMX capability MSRs while the vCPU
3938 * is in VMX operation.
3939 */
3940 if (vmx->nested.vmxon)
3941 return -EBUSY;
3942
3943 switch (msr_index) {
3944 case MSR_IA32_VMX_BASIC:
3945 return vmx_restore_vmx_basic(vmx, data);
3946 case MSR_IA32_VMX_PINBASED_CTLS:
3947 case MSR_IA32_VMX_PROCBASED_CTLS:
3948 case MSR_IA32_VMX_EXIT_CTLS:
3949 case MSR_IA32_VMX_ENTRY_CTLS:
3950 /*
3951 * The "non-true" VMX capability MSRs are generated from the
3952 * "true" MSRs, so we do not support restoring them directly.
3953 *
3954	 * If userspace wants to emulate VMX_BASIC[55]=0, it should
3955	 * restore the "true" MSRs with the must-be-1 bits set
3956	 * according to SDM Vol. 3, Appendix A.2, "RESERVED CONTROLS
3957	 * AND DEFAULT SETTINGS".
3958 */
3959 return -EINVAL;
3960 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
3961 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
3962 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
3963 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
3964 case MSR_IA32_VMX_PROCBASED_CTLS2:
3965 return vmx_restore_control_msr(vmx, msr_index, data);
3966 case MSR_IA32_VMX_MISC:
3967 return vmx_restore_vmx_misc(vmx, data);
3968 case MSR_IA32_VMX_CR0_FIXED0:
3969 case MSR_IA32_VMX_CR4_FIXED0:
3970 return vmx_restore_fixed0_msr(vmx, msr_index, data);
3971 case MSR_IA32_VMX_CR0_FIXED1:
3972 case MSR_IA32_VMX_CR4_FIXED1:
3973 /*
3974 * These MSRs are generated based on the vCPU's CPUID, so we
3975 * do not support restoring them directly.
3976 */
3977 return -EINVAL;
3978 case MSR_IA32_VMX_EPT_VPID_CAP:
3979 return vmx_restore_vmx_ept_vpid_cap(vmx, data);
3980 case MSR_IA32_VMX_VMCS_ENUM:
3981 vmx->nested.msrs.vmcs_enum = data;
3982 return 0;
3983 default:
3984 /*
3985 * The rest of the VMX capability MSRs do not support restore.
3986 */
3987 return -EINVAL;
3988 }
3989}
3990
3991/* Returns 0 on success, non-0 otherwise. */
3992static int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
3993{
3994 switch (msr_index) {
3995 case MSR_IA32_VMX_BASIC:
3996 *pdata = msrs->basic;
3997 break;
3998 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
3999 case MSR_IA32_VMX_PINBASED_CTLS:
4000 *pdata = vmx_control_msr(
4001 msrs->pinbased_ctls_low,
4002 msrs->pinbased_ctls_high);
4003 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
4004 *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
4005 break;
4006 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
4007 case MSR_IA32_VMX_PROCBASED_CTLS:
4008 *pdata = vmx_control_msr(
4009 msrs->procbased_ctls_low,
4010 msrs->procbased_ctls_high);
4011 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
4012 *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
4013 break;
4014 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
4015 case MSR_IA32_VMX_EXIT_CTLS:
4016 *pdata = vmx_control_msr(
4017 msrs->exit_ctls_low,
4018 msrs->exit_ctls_high);
4019 if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
4020 *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
4021 break;
4022 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
4023 case MSR_IA32_VMX_ENTRY_CTLS:
4024 *pdata = vmx_control_msr(
4025 msrs->entry_ctls_low,
4026 msrs->entry_ctls_high);
4027 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
4028 *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
4029 break;
4030 case MSR_IA32_VMX_MISC:
4031 *pdata = vmx_control_msr(
4032 msrs->misc_low,
4033 msrs->misc_high);
4034 break;
4035 case MSR_IA32_VMX_CR0_FIXED0:
4036 *pdata = msrs->cr0_fixed0;
4037 break;
4038 case MSR_IA32_VMX_CR0_FIXED1:
4039 *pdata = msrs->cr0_fixed1;
4040 break;
4041 case MSR_IA32_VMX_CR4_FIXED0:
4042 *pdata = msrs->cr4_fixed0;
4043 break;
4044 case MSR_IA32_VMX_CR4_FIXED1:
4045 *pdata = msrs->cr4_fixed1;
4046 break;
4047 case MSR_IA32_VMX_VMCS_ENUM:
4048 *pdata = msrs->vmcs_enum;
4049 break;
4050 case MSR_IA32_VMX_PROCBASED_CTLS2:
4051 *pdata = vmx_control_msr(
4052 msrs->secondary_ctls_low,
4053 msrs->secondary_ctls_high);
4054 break;
4055 case MSR_IA32_VMX_EPT_VPID_CAP:
4056 *pdata = msrs->ept_caps |
4057 ((u64)msrs->vpid_caps << 32);
4058 break;
4059 case MSR_IA32_VMX_VMFUNC:
4060 *pdata = msrs->vmfunc_controls;
4061 break;
4062 default:
4063 return 1;
4064 }
4065
4066 return 0;
4067}
4068
4069static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
4070 uint64_t val)
4071{
4072 uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
4073
4074 return !(val & ~valid_bits);
4075}
4076
4077static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
4078{
4079 switch (msr->index) {
4080 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
4081 if (!nested)
4082 return 1;
4083 return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
4084 default:
4085 return 1;
4086 }
4087
4088 return 0;
4089}
4090
4091/*
4092 * Reads an MSR value (identified by msr_info->index) into msr_info->data.
4093 * Returns 0 on success, non-0 otherwise.
4094 * Assumes vcpu_load() was already called.
4095 */
4096static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
4097{
4098 struct vcpu_vmx *vmx = to_vmx(vcpu);
4099 struct shared_msr_entry *msr;
4100
4101 switch (msr_info->index) {
4102#ifdef CONFIG_X86_64
4103 case MSR_FS_BASE:
4104 msr_info->data = vmcs_readl(GUEST_FS_BASE);
4105 break;
4106 case MSR_GS_BASE:
4107 msr_info->data = vmcs_readl(GUEST_GS_BASE);
4108 break;
4109 case MSR_KERNEL_GS_BASE:
4110 msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
4111 break;
4112#endif
4113 case MSR_EFER:
4114 return kvm_get_msr_common(vcpu, msr_info);
4115 case MSR_IA32_SPEC_CTRL:
4116 if (!msr_info->host_initiated &&
4117 !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
4118 return 1;
4119
4120 msr_info->data = to_vmx(vcpu)->spec_ctrl;
4121 break;
4122 case MSR_IA32_ARCH_CAPABILITIES:
4123 if (!msr_info->host_initiated &&
4124 !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
4125 return 1;
4126 msr_info->data = to_vmx(vcpu)->arch_capabilities;
4127 break;
4128 case MSR_IA32_SYSENTER_CS:
4129 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
4130 break;
4131 case MSR_IA32_SYSENTER_EIP:
4132 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
4133 break;
4134 case MSR_IA32_SYSENTER_ESP:
4135 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
4136 break;
4137 case MSR_IA32_BNDCFGS:
4138 if (!kvm_mpx_supported() ||
4139 (!msr_info->host_initiated &&
4140 !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
4141 return 1;
4142 msr_info->data = vmcs_read64(GUEST_BNDCFGS);
4143 break;
4144 case MSR_IA32_MCG_EXT_CTL:
4145 if (!msr_info->host_initiated &&
4146 !(vmx->msr_ia32_feature_control &
4147 FEATURE_CONTROL_LMCE))
4148 return 1;
4149 msr_info->data = vcpu->arch.mcg_ext_ctl;
4150 break;
4151 case MSR_IA32_FEATURE_CONTROL:
4152 msr_info->data = vmx->msr_ia32_feature_control;
4153 break;
4154 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
4155 if (!nested_vmx_allowed(vcpu))
4156 return 1;
4157 return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
4158 &msr_info->data);
4159 case MSR_IA32_XSS:
4160 if (!vmx_xsaves_supported())
4161 return 1;
4162 msr_info->data = vcpu->arch.ia32_xss;
4163 break;
4164 case MSR_TSC_AUX:
4165 if (!msr_info->host_initiated &&
4166 !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
4167 return 1;
4168 /* Otherwise falls through */
4169 default:
4170 msr = find_msr_entry(vmx, msr_info->index);
4171 if (msr) {
4172 msr_info->data = msr->data;
4173 break;
4174 }
4175 return kvm_get_msr_common(vcpu, msr_info);
4176 }
4177
4178 return 0;
4179}
4180
4181static void vmx_leave_nested(struct kvm_vcpu *vcpu);
4182
4183/*
4184 * Writes an MSR value into the appropriate "register".
4185 * Returns 0 on success, non-0 otherwise.
4186 * Assumes vcpu_load() was already called.
4187 */
4188static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
4189{
4190 struct vcpu_vmx *vmx = to_vmx(vcpu);
4191 struct shared_msr_entry *msr;
4192 int ret = 0;
4193 u32 msr_index = msr_info->index;
4194 u64 data = msr_info->data;
4195
4196 switch (msr_index) {
4197 case MSR_EFER:
4198 ret = kvm_set_msr_common(vcpu, msr_info);
4199 break;
4200#ifdef CONFIG_X86_64
4201 case MSR_FS_BASE:
4202 vmx_segment_cache_clear(vmx);
4203 vmcs_writel(GUEST_FS_BASE, data);
4204 break;
4205 case MSR_GS_BASE:
4206 vmx_segment_cache_clear(vmx);
4207 vmcs_writel(GUEST_GS_BASE, data);
4208 break;
4209 case MSR_KERNEL_GS_BASE:
4210 vmx_write_guest_kernel_gs_base(vmx, data);
4211 break;
4212#endif
4213 case MSR_IA32_SYSENTER_CS:
4214 vmcs_write32(GUEST_SYSENTER_CS, data);
4215 break;
4216 case MSR_IA32_SYSENTER_EIP:
4217 vmcs_writel(GUEST_SYSENTER_EIP, data);
4218 break;
4219 case MSR_IA32_SYSENTER_ESP:
4220 vmcs_writel(GUEST_SYSENTER_ESP, data);
4221 break;
4222 case MSR_IA32_BNDCFGS:
4223 if (!kvm_mpx_supported() ||
4224 (!msr_info->host_initiated &&
4225 !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
4226 return 1;
4227 if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
4228 (data & MSR_IA32_BNDCFGS_RSVD))
4229 return 1;
4230 vmcs_write64(GUEST_BNDCFGS, data);
4231 break;
4232 case MSR_IA32_SPEC_CTRL:
4233 if (!msr_info->host_initiated &&
4234 !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
4235 return 1;
4236
4237 /* The STIBP bit doesn't fault even if it's not advertised */
4238 if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
4239 return 1;
4240
4241 vmx->spec_ctrl = data;
4242
4243 if (!data)
4244 break;
4245
4246 /*
4247 * For non-nested:
4248 * When it's written (to non-zero) for the first time, pass
4249 * it through.
4250 *
4251 * For nested:
4252 * The handling of the MSR bitmap for L2 guests is done in
4253 * nested_vmx_merge_msr_bitmap. We should not touch the
4254 * vmcs02.msr_bitmap here since it gets completely overwritten
4255 * in the merging. We update the vmcs01 here for L1 as well
4256 * since it will end up touching the MSR anyway now.
4257 */
4258 vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
4259 MSR_IA32_SPEC_CTRL,
4260 MSR_TYPE_RW);
4261 break;
4262 case MSR_IA32_PRED_CMD:
4263 if (!msr_info->host_initiated &&
4264 !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
4265 return 1;
4266
4267 if (data & ~PRED_CMD_IBPB)
4268 return 1;
4269
4270 if (!data)
4271 break;
4272
4273 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
4274
4275 /*
4276 * For non-nested:
4277 * When it's written (to non-zero) for the first time, pass
4278 * it through.
4279 *
4280 * For nested:
4281 * The handling of the MSR bitmap for L2 guests is done in
4282 * nested_vmx_merge_msr_bitmap. We should not touch the
4283 * vmcs02.msr_bitmap here since it gets completely overwritten
4284 * in the merging.
4285 */
4286 vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
4287 MSR_TYPE_W);
4288 break;
4289 case MSR_IA32_ARCH_CAPABILITIES:
4290 if (!msr_info->host_initiated)
4291 return 1;
4292 vmx->arch_capabilities = data;
4293 break;
4294 case MSR_IA32_CR_PAT:
4295 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
4296 if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
4297 return 1;
4298 vmcs_write64(GUEST_IA32_PAT, data);
4299 vcpu->arch.pat = data;
4300 break;
4301 }
4302 ret = kvm_set_msr_common(vcpu, msr_info);
4303 break;
4304 case MSR_IA32_TSC_ADJUST:
4305 ret = kvm_set_msr_common(vcpu, msr_info);
4306 break;
4307 case MSR_IA32_MCG_EXT_CTL:
4308 if ((!msr_info->host_initiated &&
4309 !(to_vmx(vcpu)->msr_ia32_feature_control &
4310 FEATURE_CONTROL_LMCE)) ||
4311 (data & ~MCG_EXT_CTL_LMCE_EN))
4312 return 1;
4313 vcpu->arch.mcg_ext_ctl = data;
4314 break;
4315 case MSR_IA32_FEATURE_CONTROL:
4316 if (!vmx_feature_control_msr_valid(vcpu, data) ||
4317 (to_vmx(vcpu)->msr_ia32_feature_control &
4318 FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
4319 return 1;
4320 vmx->msr_ia32_feature_control = data;
4321 if (msr_info->host_initiated && data == 0)
4322 vmx_leave_nested(vcpu);
4323 break;
4324 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
4325 if (!msr_info->host_initiated)
4326 return 1; /* they are read-only */
4327 if (!nested_vmx_allowed(vcpu))
4328 return 1;
4329 return vmx_set_vmx_msr(vcpu, msr_index, data);
4330 case MSR_IA32_XSS:
4331 if (!vmx_xsaves_supported())
4332 return 1;
4333 /*
4334		 * The only bit supported by hardware as of Skylake is bit 8,
4335		 * but it is not supported by KVM.
4336 */
4337 if (data != 0)
4338 return 1;
4339 vcpu->arch.ia32_xss = data;
4340 if (vcpu->arch.ia32_xss != host_xss)
4341 add_atomic_switch_msr(vmx, MSR_IA32_XSS,
4342 vcpu->arch.ia32_xss, host_xss, false);
4343 else
4344 clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
4345 break;
4346 case MSR_TSC_AUX:
4347 if (!msr_info->host_initiated &&
4348 !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
4349 return 1;
4350		/* Check reserved bits; the upper 32 bits must be zero */
4351 if ((data >> 32) != 0)
4352 return 1;
4353 /* Otherwise falls through */
4354 default:
4355 msr = find_msr_entry(vmx, msr_index);
4356 if (msr) {
4357 u64 old_msr_data = msr->data;
4358 msr->data = data;
4359 if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
4360 preempt_disable();
4361 ret = kvm_set_shared_msr(msr->index, msr->data,
4362 msr->mask);
4363 preempt_enable();
4364 if (ret)
4365 msr->data = old_msr_data;
4366 }
4367 break;
4368 }
4369 ret = kvm_set_msr_common(vcpu, msr_info);
4370 }
4371
4372 return ret;
4373}
4374
4375static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
4376{
4377 __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
4378 switch (reg) {
4379 case VCPU_REGS_RSP:
4380 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
4381 break;
4382 case VCPU_REGS_RIP:
4383 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
4384 break;
4385 case VCPU_EXREG_PDPTR:
4386 if (enable_ept)
4387 ept_save_pdptrs(vcpu);
4388 break;
4389 default:
4390 break;
4391 }
4392}
4393
4394static __init int cpu_has_kvm_support(void)
4395{
4396 return cpu_has_vmx();
4397}
4398
4399static __init int vmx_disabled_by_bios(void)
4400{
4401 u64 msr;
4402
4403 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
4404 if (msr & FEATURE_CONTROL_LOCKED) {
4405 /* launched w/ TXT and VMX disabled */
4406 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
4407 && tboot_enabled())
4408 return 1;
4409 /* launched w/o TXT and VMX only enabled w/ TXT */
4410 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
4411 && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
4412 && !tboot_enabled()) {
4413 printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
4414 "activate TXT before enabling KVM\n");
4415 return 1;
4416 }
4417 /* launched w/o TXT and VMX disabled */
4418 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
4419 && !tboot_enabled())
4420 return 1;
4421 }
4422
4423 return 0;
4424}
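
/*
 * Summary of the IA32_FEATURE_CONTROL handling around here: when the
 * BIOS has set FEATURE_CONTROL_LOCKED, VMXON is only usable in the
 * environments (inside/outside SMX, i.e. with or without TXT) whose
 * FEATURE_CONTROL_VMXON_ENABLED_* bit is also set; the function above
 * reports VMX as BIOS-disabled for the remaining combinations.  When
 * the MSR is left unlocked, hardware_enable() below sets and locks the
 * required bits itself before executing VMXON.
 */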
4425
4426static void kvm_cpu_vmxon(u64 addr)
4427{
4428 cr4_set_bits(X86_CR4_VMXE);
4429 intel_pt_handle_vmx(1);
4430
4431 asm volatile ("vmxon %0" : : "m"(addr));
4432}
4433
4434static int hardware_enable(void)
4435{
4436 int cpu = raw_smp_processor_id();
4437 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
4438 u64 old, test_bits;
4439
4440 if (cr4_read_shadow() & X86_CR4_VMXE)
4441 return -EBUSY;
4442
4443 /*
4444 * This can happen if we hot-added a CPU but failed to allocate
4445 * VP assist page for it.
4446 */
4447 if (static_branch_unlikely(&enable_evmcs) &&
4448 !hv_get_vp_assist_page(cpu))
4449 return -EFAULT;
4450
4451 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
4452 INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
4453 spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
4454
4455 /*
4456 * Now we can enable the vmclear operation in kdump
4457 * since the loaded_vmcss_on_cpu list on this cpu
4458 * has been initialized.
4459 *
4460	 * Though the cpu is not in VMX operation yet, it is safe
4461	 * to enable the vmclear operation because the
4462	 * loaded_vmcss_on_cpu list is still empty.
4463 */
4464 crash_enable_local_vmclear(cpu);
4465
4466 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
4467
4468 test_bits = FEATURE_CONTROL_LOCKED;
4469 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
4470 if (tboot_enabled())
4471 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
4472
4473 if ((old & test_bits) != test_bits) {
4474 /* enable and lock */
4475 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
4476 }
4477 kvm_cpu_vmxon(phys_addr);
4478 if (enable_ept)
4479 ept_sync_global();
4480
4481 return 0;
4482}
4483
4484static void vmclear_local_loaded_vmcss(void)
4485{
4486 int cpu = raw_smp_processor_id();
4487 struct loaded_vmcs *v, *n;
4488
4489 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
4490 loaded_vmcss_on_cpu_link)
4491 __loaded_vmcs_clear(v);
4492}
4493
4494
4495/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
4496 * tricks.
4497 */
4498static void kvm_cpu_vmxoff(void)
4499{
4500 asm volatile (__ex("vmxoff"));
4501
4502 intel_pt_handle_vmx(0);
4503 cr4_clear_bits(X86_CR4_VMXE);
4504}
4505
4506static void hardware_disable(void)
4507{
4508 vmclear_local_loaded_vmcss();
4509 kvm_cpu_vmxoff();
4510}
4511
4512static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
4513 u32 msr, u32 *result)
4514{
4515 u32 vmx_msr_low, vmx_msr_high;
4516 u32 ctl = ctl_min | ctl_opt;
4517
4518 rdmsr(msr, vmx_msr_low, vmx_msr_high);
4519
4520 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
4521 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
4522
4523 /* Ensure minimum (required) set of control bits are supported. */
4524 if (ctl_min & ~ctl)
4525 return -EIO;
4526
4527 *result = ctl;
4528 return 0;
4529}
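
/*
 * Example of the adjustment above with made-up values: for ctl_min = A,
 * ctl_opt = B|C and a capability MSR whose high word allows only A|B
 * while its low word forces A, the result is ((A|B|C) & (A|B)) | A =
 * A|B.  Optional-but-unsupported bits (here C) are silently dropped;
 * the call only fails with -EIO when a bit in ctl_min itself cannot be
 * set to 1.
 */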
4530
4531static __init bool allow_1_setting(u32 msr, u32 ctl)
4532{
4533 u32 vmx_msr_low, vmx_msr_high;
4534
4535 rdmsr(msr, vmx_msr_low, vmx_msr_high);
4536 return vmx_msr_high & ctl;
4537}
4538
4539static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
4540{
4541 u32 vmx_msr_low, vmx_msr_high;
4542 u32 min, opt, min2, opt2;
4543 u32 _pin_based_exec_control = 0;
4544 u32 _cpu_based_exec_control = 0;
4545 u32 _cpu_based_2nd_exec_control = 0;
4546 u32 _vmexit_control = 0;
4547 u32 _vmentry_control = 0;
4548
4549 memset(vmcs_conf, 0, sizeof(*vmcs_conf));
4550 min = CPU_BASED_HLT_EXITING |
4551#ifdef CONFIG_X86_64
4552 CPU_BASED_CR8_LOAD_EXITING |
4553 CPU_BASED_CR8_STORE_EXITING |
4554#endif
4555 CPU_BASED_CR3_LOAD_EXITING |
4556 CPU_BASED_CR3_STORE_EXITING |
4557 CPU_BASED_UNCOND_IO_EXITING |
4558 CPU_BASED_MOV_DR_EXITING |
4559 CPU_BASED_USE_TSC_OFFSETING |
4560 CPU_BASED_MWAIT_EXITING |
4561 CPU_BASED_MONITOR_EXITING |
4562 CPU_BASED_INVLPG_EXITING |
4563 CPU_BASED_RDPMC_EXITING;
4564
4565 opt = CPU_BASED_TPR_SHADOW |
4566 CPU_BASED_USE_MSR_BITMAPS |
4567 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
4568 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
4569 &_cpu_based_exec_control) < 0)
4570 return -EIO;
4571#ifdef CONFIG_X86_64
4572 if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
4573 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
4574 ~CPU_BASED_CR8_STORE_EXITING;
4575#endif
4576 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
4577 min2 = 0;
4578 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
4579 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
4580 SECONDARY_EXEC_WBINVD_EXITING |
4581 SECONDARY_EXEC_ENABLE_VPID |
4582 SECONDARY_EXEC_ENABLE_EPT |
4583 SECONDARY_EXEC_UNRESTRICTED_GUEST |
4584 SECONDARY_EXEC_PAUSE_LOOP_EXITING |
4585 SECONDARY_EXEC_DESC |
4586 SECONDARY_EXEC_RDTSCP |
4587 SECONDARY_EXEC_ENABLE_INVPCID |
4588 SECONDARY_EXEC_APIC_REGISTER_VIRT |
4589 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
4590 SECONDARY_EXEC_SHADOW_VMCS |
4591 SECONDARY_EXEC_XSAVES |
4592 SECONDARY_EXEC_RDSEED_EXITING |
4593 SECONDARY_EXEC_RDRAND_EXITING |
4594 SECONDARY_EXEC_ENABLE_PML |
4595 SECONDARY_EXEC_TSC_SCALING |
4596 SECONDARY_EXEC_ENABLE_VMFUNC |
4597 SECONDARY_EXEC_ENCLS_EXITING;
4598 if (adjust_vmx_controls(min2, opt2,
4599 MSR_IA32_VMX_PROCBASED_CTLS2,
4600 &_cpu_based_2nd_exec_control) < 0)
4601 return -EIO;
4602 }
4603#ifndef CONFIG_X86_64
4604 if (!(_cpu_based_2nd_exec_control &
4605 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
4606 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
4607#endif
4608
4609 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
4610 _cpu_based_2nd_exec_control &= ~(
4611 SECONDARY_EXEC_APIC_REGISTER_VIRT |
4612 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
4613 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4614
4615 rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
4616 &vmx_capability.ept, &vmx_capability.vpid);
4617
4618 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
4619		/* CR3 accesses and invlpg don't need to cause VM exits when EPT
4620		   is enabled */
4621 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
4622 CPU_BASED_CR3_STORE_EXITING |
4623 CPU_BASED_INVLPG_EXITING);
4624 } else if (vmx_capability.ept) {
4625 vmx_capability.ept = 0;
4626 pr_warn_once("EPT CAP should not exist if not support "
4627 "1-setting enable EPT VM-execution control\n");
4628 }
4629 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
4630 vmx_capability.vpid) {
4631 vmx_capability.vpid = 0;
4632 pr_warn_once("VPID CAP should not exist if not support "
4633 "1-setting enable VPID VM-execution control\n");
4634 }
4635
4636 min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
4637#ifdef CONFIG_X86_64
4638 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
4639#endif
4640 opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
4641 VM_EXIT_CLEAR_BNDCFGS;
4642 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
4643 &_vmexit_control) < 0)
4644 return -EIO;
4645
4646 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
4647 opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
4648 PIN_BASED_VMX_PREEMPTION_TIMER;
4649 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
4650 &_pin_based_exec_control) < 0)
4651 return -EIO;
4652
4653 if (cpu_has_broken_vmx_preemption_timer())
4654 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
4655 if (!(_cpu_based_2nd_exec_control &
4656 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
4657 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
4658
4659 min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
4660 opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
4661 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
4662 &_vmentry_control) < 0)
4663 return -EIO;
4664
4665 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
4666
4667 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
4668 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
4669 return -EIO;
4670
4671#ifdef CONFIG_X86_64
4672 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
4673 if (vmx_msr_high & (1u<<16))
4674 return -EIO;
4675#endif
4676
4677 /* Require Write-Back (WB) memory type for VMCS accesses. */
4678 if (((vmx_msr_high >> 18) & 15) != 6)
4679 return -EIO;
4680
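	/*
	 * For reference, the MSR_IA32_VMX_BASIC layout used by the checks
	 * above and the assignments below is roughly (per the SDM appendix
	 * on VMX capability MSRs): the low 32 bits hold the VMCS revision
	 * identifier, high-word bits 12:0 (MSR bits 44:32) the VMCS region
	 * size, high-word bit 16 (MSR bit 48) the 32-bit physical-address
	 * limitation, and high-word bits 21:18 (MSR bits 53:50) the
	 * required VMCS memory type, where 6 means write-back.
	 */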
4681 vmcs_conf->size = vmx_msr_high & 0x1fff;
4682 vmcs_conf->order = get_order(vmcs_conf->size);
4683 vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
4684
4685 vmcs_conf->revision_id = vmx_msr_low;
4686
4687 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
4688 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
4689 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
4690 vmcs_conf->vmexit_ctrl = _vmexit_control;
4691 vmcs_conf->vmentry_ctrl = _vmentry_control;
4692
4693 if (static_branch_unlikely(&enable_evmcs))
4694 evmcs_sanitize_exec_ctrls(vmcs_conf);
4695
4696 cpu_has_load_ia32_efer =
4697 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
4698 VM_ENTRY_LOAD_IA32_EFER)
4699 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
4700 VM_EXIT_LOAD_IA32_EFER);
4701
4702 cpu_has_load_perf_global_ctrl =
4703 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
4704 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
4705 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
4706 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
4707
4708 /*
4709	 * Some CPUs support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL,
4710	 * but due to the errata below it can't be used. The workaround is
4711	 * to use the MSR load/store mechanism to switch IA32_PERF_GLOBAL_CTRL.
4712 *
4713 * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
4714 *
4715 * AAK155 (model 26)
4716 * AAP115 (model 30)
4717 * AAT100 (model 37)
4718 * BC86,AAY89,BD102 (model 44)
4719 * BA97 (model 46)
4720 *
4721 */
4722 if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) {
4723 switch (boot_cpu_data.x86_model) {
4724 case 26:
4725 case 30:
4726 case 37:
4727 case 44:
4728 case 46:
4729 cpu_has_load_perf_global_ctrl = false;
4730 printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
4731 "does not work properly. Using workaround\n");
4732 break;
4733 default:
4734 break;
4735 }
4736 }
4737
4738 if (boot_cpu_has(X86_FEATURE_XSAVES))
4739 rdmsrl(MSR_IA32_XSS, host_xss);
4740
4741 return 0;
4742}
4743
4744static struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu)
4745{
4746 int node = cpu_to_node(cpu);
4747 struct page *pages;
4748 struct vmcs *vmcs;
4749
4750 pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
4751 if (!pages)
4752 return NULL;
4753 vmcs = page_address(pages);
4754 memset(vmcs, 0, vmcs_config.size);
4755
4756 /* KVM supports Enlightened VMCS v1 only */
4757 if (static_branch_unlikely(&enable_evmcs))
4758 vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
4759 else
4760 vmcs->hdr.revision_id = vmcs_config.revision_id;
4761
4762 if (shadow)
4763 vmcs->hdr.shadow_vmcs = 1;
4764 return vmcs;
4765}
4766
4767static void free_vmcs(struct vmcs *vmcs)
4768{
4769 free_pages((unsigned long)vmcs, vmcs_config.order);
4770}
4771
4772/*
4773 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
4774 */
4775static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
4776{
4777 if (!loaded_vmcs->vmcs)
4778 return;
4779 loaded_vmcs_clear(loaded_vmcs);
4780 free_vmcs(loaded_vmcs->vmcs);
4781 loaded_vmcs->vmcs = NULL;
4782 if (loaded_vmcs->msr_bitmap)
4783 free_page((unsigned long)loaded_vmcs->msr_bitmap);
4784 WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
4785}
4786
4787static struct vmcs *alloc_vmcs(bool shadow)
4788{
4789 return alloc_vmcs_cpu(shadow, raw_smp_processor_id());
4790}
4791
4792static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
4793{
4794 loaded_vmcs->vmcs = alloc_vmcs(false);
4795 if (!loaded_vmcs->vmcs)
4796 return -ENOMEM;
4797
4798 loaded_vmcs->shadow_vmcs = NULL;
4799 loaded_vmcs_init(loaded_vmcs);
4800
4801 if (cpu_has_vmx_msr_bitmap()) {
4802 loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
4803 if (!loaded_vmcs->msr_bitmap)
4804 goto out_vmcs;
4805 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
4806
4807 if (IS_ENABLED(CONFIG_HYPERV) &&
4808 static_branch_unlikely(&enable_evmcs) &&
4809 (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
4810 struct hv_enlightened_vmcs *evmcs =
4811 (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
4812
4813 evmcs->hv_enlightenments_control.msr_bitmap = 1;
4814 }
4815 }
4816
4817 memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
4818
4819 return 0;
4820
4821out_vmcs:
4822 free_loaded_vmcs(loaded_vmcs);
4823 return -ENOMEM;
4824}
4825
4826static void free_kvm_area(void)
4827{
4828 int cpu;
4829
4830 for_each_possible_cpu(cpu) {
4831 free_vmcs(per_cpu(vmxarea, cpu));
4832 per_cpu(vmxarea, cpu) = NULL;
4833 }
4834}
4835
4836enum vmcs_field_width {
4837 VMCS_FIELD_WIDTH_U16 = 0,
4838 VMCS_FIELD_WIDTH_U64 = 1,
4839 VMCS_FIELD_WIDTH_U32 = 2,
4840 VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3
4841};
4842
4843static inline int vmcs_field_width(unsigned long field)
4844{
4845 if (0x1 & field) /* the *_HIGH fields are all 32 bit */
4846 return VMCS_FIELD_WIDTH_U32;
4847	return (field >> 13) & 0x3;
4848}
4849
4850static inline int vmcs_field_readonly(unsigned long field)
4851{
4852 return (((field >> 10) & 0x3) == 1);
4853}
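
/*
 * The two helpers above decode the standard VMCS field encoding: bit 0
 * selects the high half of a 64-bit field, bits 14:13 give the field
 * width returned by vmcs_field_width(), and bits 11:10 give the field
 * type, where (per the SDM) type 1 is the read-only VM-exit
 * information class that vmcs_field_readonly() checks for.
 */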
4854
4855static void init_vmcs_shadow_fields(void)
4856{
4857 int i, j;
4858
4859 for (i = j = 0; i < max_shadow_read_only_fields; i++) {
4860 u16 field = shadow_read_only_fields[i];
4861 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
4862 (i + 1 == max_shadow_read_only_fields ||
4863 shadow_read_only_fields[i + 1] != field + 1))
4864 pr_err("Missing field from shadow_read_only_field %x\n",
4865 field + 1);
4866
4867 clear_bit(field, vmx_vmread_bitmap);
4868#ifdef CONFIG_X86_64
4869 if (field & 1)
4870 continue;
4871#endif
4872 if (j < i)
4873 shadow_read_only_fields[j] = field;
4874 j++;
4875 }
4876 max_shadow_read_only_fields = j;
4877
4878 for (i = j = 0; i < max_shadow_read_write_fields; i++) {
4879 u16 field = shadow_read_write_fields[i];
4880 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
4881 (i + 1 == max_shadow_read_write_fields ||
4882 shadow_read_write_fields[i + 1] != field + 1))
4883 pr_err("Missing field from shadow_read_write_field %x\n",
4884 field + 1);
4885
4886 /*
4887 * PML and the preemption timer can be emulated, but the
4888 * processor cannot vmwrite to fields that don't exist
4889 * on bare metal.
4890 */
4891 switch (field) {
4892 case GUEST_PML_INDEX:
4893 if (!cpu_has_vmx_pml())
4894 continue;
4895 break;
4896 case VMX_PREEMPTION_TIMER_VALUE:
4897 if (!cpu_has_vmx_preemption_timer())
4898 continue;
4899 break;
4900 case GUEST_INTR_STATUS:
4901 if (!cpu_has_vmx_apicv())
4902 continue;
4903 break;
4904 default:
4905 break;
4906 }
4907
4908 clear_bit(field, vmx_vmwrite_bitmap);
4909 clear_bit(field, vmx_vmread_bitmap);
4910#ifdef CONFIG_X86_64
4911 if (field & 1)
4912 continue;
4913#endif
4914 if (j < i)
4915 shadow_read_write_fields[j] = field;
4916 j++;
4917 }
4918 max_shadow_read_write_fields = j;
4919}
4920
4921static __init int alloc_kvm_area(void)
4922{
4923 int cpu;
4924
4925 for_each_possible_cpu(cpu) {
4926 struct vmcs *vmcs;
4927
4928 vmcs = alloc_vmcs_cpu(false, cpu);
4929 if (!vmcs) {
4930 free_kvm_area();
4931 return -ENOMEM;
4932 }
4933
4934 /*
4935		 * When eVMCS is enabled, alloc_vmcs_cpu() sets
4936		 * vmcs->hdr.revision_id to KVM_EVMCS_VERSION instead of
4937		 * the revision_id reported by MSR_IA32_VMX_BASIC.
4938		 *
4939		 * However, even though not explicitly documented by the
4940		 * TLFS, the vmxarea passed as the VMXON argument should
4941		 * still be marked with the revision_id reported by the
4942		 * physical CPU.
4943 */
4944 if (static_branch_unlikely(&enable_evmcs))
4945 vmcs->hdr.revision_id = vmcs_config.revision_id;
4946
4947 per_cpu(vmxarea, cpu) = vmcs;
4948 }
4949 return 0;
4950}
4951
4952static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
4953 struct kvm_segment *save)
4954{
4955 if (!emulate_invalid_guest_state) {
4956 /*
4957 * CS and SS RPL should be equal during guest entry according
4958 * to VMX spec, but in reality it is not always so. Since vcpu
4959 * is in the middle of the transition from real mode to
4960 * protected mode it is safe to assume that RPL 0 is a good
4961 * default value.
4962 */
4963 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
4964 save->selector &= ~SEGMENT_RPL_MASK;
4965 save->dpl = save->selector & SEGMENT_RPL_MASK;
4966 save->s = 1;
4967 }
4968 vmx_set_segment(vcpu, save, seg);
4969}
4970
4971static void enter_pmode(struct kvm_vcpu *vcpu)
4972{
4973 unsigned long flags;
4974 struct vcpu_vmx *vmx = to_vmx(vcpu);
4975
4976 /*
4977	 * Update the real-mode segment cache. It may not be up to date if a
4978	 * segment register was written while the vcpu was in guest mode.
4979 */
4980 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
4981 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
4982 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
4983 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
4984 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
4985 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
4986
4987 vmx->rmode.vm86_active = 0;
4988
4989 vmx_segment_cache_clear(vmx);
4990
4991 vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
4992
4993 flags = vmcs_readl(GUEST_RFLAGS);
4994 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
4995 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
4996 vmcs_writel(GUEST_RFLAGS, flags);
4997
4998 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
4999 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
5000
5001 update_exception_bitmap(vcpu);
5002
5003 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
5004 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
5005 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
5006 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
5007 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
5008 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
5009}
5010
5011static void fix_rmode_seg(int seg, struct kvm_segment *save)
5012{
5013 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
5014 struct kvm_segment var = *save;
5015
5016 var.dpl = 0x3;
5017 if (seg == VCPU_SREG_CS)
5018 var.type = 0x3;
5019
5020 if (!emulate_invalid_guest_state) {
5021 var.selector = var.base >> 4;
5022 var.base = var.base & 0xffff0;
5023 var.limit = 0xffff;
5024 var.g = 0;
5025 var.db = 0;
5026 var.present = 1;
5027 var.s = 1;
5028 var.l = 0;
5029 var.unusable = 0;
5030 var.type = 0x3;
5031 var.avl = 0;
5032 if (save->base & 0xf)
5033 printk_once(KERN_WARNING "kvm: segment base is not "
5034 "paragraph aligned when entering "
5035 "protected mode (seg=%d)", seg);
5036 }
5037
5038 vmcs_write16(sf->selector, var.selector);
5039 vmcs_writel(sf->base, var.base);
5040 vmcs_write32(sf->limit, var.limit);
5041 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
5042}
5043
5044static void enter_rmode(struct kvm_vcpu *vcpu)
5045{
5046 unsigned long flags;
5047 struct vcpu_vmx *vmx = to_vmx(vcpu);
5048 struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
5049
5050 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
5051 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
5052 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
5053 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
5054 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
5055 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
5056 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
5057
5058 vmx->rmode.vm86_active = 1;
5059
5060 /*
5061 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
5062 * vcpu. Warn the user that an update is overdue.
5063 */
5064 if (!kvm_vmx->tss_addr)
5065		printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR needs to be "
5066			     "called before entering vcpu\n");
5067
5068 vmx_segment_cache_clear(vmx);
5069
5070 vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
5071 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
5072 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
5073
5074 flags = vmcs_readl(GUEST_RFLAGS);
5075 vmx->rmode.save_rflags = flags;
5076
5077 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
5078
5079 vmcs_writel(GUEST_RFLAGS, flags);
5080 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
5081 update_exception_bitmap(vcpu);
5082
5083 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
5084 fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
5085 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
5086 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
5087 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
5088 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
5089
5090 kvm_mmu_reset_context(vcpu);
5091}
5092
5093static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
5094{
5095 struct vcpu_vmx *vmx = to_vmx(vcpu);
5096 struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
5097
5098 if (!msr)
5099 return;
5100
5101 vcpu->arch.efer = efer;
5102 if (efer & EFER_LMA) {
5103 vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
5104 msr->data = efer;
5105 } else {
5106 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
5107
5108 msr->data = efer & ~EFER_LME;
5109 }
5110 setup_msrs(vmx);
5111}
5112
5113#ifdef CONFIG_X86_64
5114
5115static void enter_lmode(struct kvm_vcpu *vcpu)
5116{
5117 u32 guest_tr_ar;
5118
5119 vmx_segment_cache_clear(to_vmx(vcpu));
5120
5121 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
5122 if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
5123 pr_debug_ratelimited("%s: tss fixup for long mode. \n",
5124 __func__);
5125 vmcs_write32(GUEST_TR_AR_BYTES,
5126 (guest_tr_ar & ~VMX_AR_TYPE_MASK)
5127 | VMX_AR_TYPE_BUSY_64_TSS);
5128 }
5129 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
5130}
5131
5132static void exit_lmode(struct kvm_vcpu *vcpu)
5133{
5134 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
5135 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
5136}
5137
5138#endif
5139
5140static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid,
5141 bool invalidate_gpa)
5142{
5143 if (enable_ept && (invalidate_gpa || !enable_vpid)) {
5144 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
5145 return;
5146 ept_sync_context(construct_eptp(vcpu,
5147 vcpu->arch.mmu->root_hpa));
5148 } else {
5149 vpid_sync_context(vpid);
5150 }
5151}
5152
5153static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
5154{
5155 __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa);
5156}
5157
5158static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
5159{
5160 int vpid = to_vmx(vcpu)->vpid;
5161
5162 if (!vpid_sync_vcpu_addr(vpid, addr))
5163 vpid_sync_context(vpid);
5164
5165 /*
5166 * If VPIDs are not supported or enabled, then the above is a no-op.
5167 * But we don't really need a TLB flush in that case anyway, because
5168 * each VM entry/exit includes an implicit flush when VPID is 0.
5169 */
5170}
5171
5172static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
5173{
5174 ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
5175
5176 vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
5177 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
5178}
5179
5180static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
5181{
5182 if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
5183 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
5184 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
5185}
5186
5187static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
5188{
5189 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
5190
5191 vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
5192 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
5193}
5194
5195static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
5196{
5197 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
5198
5199 if (!test_bit(VCPU_EXREG_PDPTR,
5200 (unsigned long *)&vcpu->arch.regs_dirty))
5201 return;
5202
5203 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
5204 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
5205 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
5206 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
5207 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
5208 }
5209}
5210
5211static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
5212{
5213 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
5214
5215 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
5216 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
5217 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
5218 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
5219 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
5220 }
5221
5222 __set_bit(VCPU_EXREG_PDPTR,
5223 (unsigned long *)&vcpu->arch.regs_avail);
5224 __set_bit(VCPU_EXREG_PDPTR,
5225 (unsigned long *)&vcpu->arch.regs_dirty);
5226}
5227
5228static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
5229{
5230 u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
5231 u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
5232 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5233
5234 if (to_vmx(vcpu)->nested.msrs.secondary_ctls_high &
5235 SECONDARY_EXEC_UNRESTRICTED_GUEST &&
5236 nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
5237 fixed0 &= ~(X86_CR0_PE | X86_CR0_PG);
5238
5239 return fixed_bits_valid(val, fixed0, fixed1);
5240}
5241
5242static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
5243{
5244 u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
5245 u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
5246
5247 return fixed_bits_valid(val, fixed0, fixed1);
5248}
5249
5250static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
5251{
5252 u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0;
5253 u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1;
5254
5255 return fixed_bits_valid(val, fixed0, fixed1);
5256}
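
/*
 * In the helpers above, the fixed0/fixed1 pair follows the usual
 * CR0/CR4 fixed-bit convention: a bit set in fixed0 must be 1 in the
 * tested value and a bit clear in fixed1 must be 0, which is what
 * fixed_bits_valid() is assumed to verify.  nested_guest_cr0_valid()
 * additionally relaxes PE and PG when the "unrestricted guest"
 * secondary control is both exposed to and used by L1.
 */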
5257
5258/* No difference in the restrictions on guest and host CR4 in VMX operation. */
5259#define nested_guest_cr4_valid nested_cr4_valid
5260#define nested_host_cr4_valid nested_cr4_valid
5261
5262static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
5263
5264static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
5265 unsigned long cr0,
5266 struct kvm_vcpu *vcpu)
5267{
5268 if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
5269 vmx_decache_cr3(vcpu);
5270 if (!(cr0 & X86_CR0_PG)) {
5271 /* From paging/starting to nonpaging */
5272 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
5273 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
5274 (CPU_BASED_CR3_LOAD_EXITING |
5275 CPU_BASED_CR3_STORE_EXITING));
5276 vcpu->arch.cr0 = cr0;
5277 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
5278 } else if (!is_paging(vcpu)) {
5279 /* From nonpaging to paging */
5280 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
5281 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
5282 ~(CPU_BASED_CR3_LOAD_EXITING |
5283 CPU_BASED_CR3_STORE_EXITING));
5284 vcpu->arch.cr0 = cr0;
5285 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
5286 }
5287
5288 if (!(cr0 & X86_CR0_WP))
5289 *hw_cr0 &= ~X86_CR0_WP;
5290}
5291
5292static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
5293{
5294 struct vcpu_vmx *vmx = to_vmx(vcpu);
5295 unsigned long hw_cr0;
5296
5297 hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
5298 if (enable_unrestricted_guest)
5299 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
5300 else {
5301 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
5302
5303 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
5304 enter_pmode(vcpu);
5305
5306 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
5307 enter_rmode(vcpu);
5308 }
5309
5310#ifdef CONFIG_X86_64
5311 if (vcpu->arch.efer & EFER_LME) {
5312 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
5313 enter_lmode(vcpu);
5314 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
5315 exit_lmode(vcpu);
5316 }
5317#endif
5318
5319 if (enable_ept && !enable_unrestricted_guest)
5320 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
5321
5322 vmcs_writel(CR0_READ_SHADOW, cr0);
5323 vmcs_writel(GUEST_CR0, hw_cr0);
5324 vcpu->arch.cr0 = cr0;
5325
5326 /* depends on vcpu->arch.cr0 to be set to a new value */
5327 vmx->emulation_required = emulation_required(vcpu);
5328}
5329
5330static int get_ept_level(struct kvm_vcpu *vcpu)
5331{
5332 if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
5333 return 5;
5334 return 4;
5335}
5336
5337static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
5338{
5339 u64 eptp = VMX_EPTP_MT_WB;
5340
5341 eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
5342
5343 if (enable_ept_ad_bits &&
5344 (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
5345 eptp |= VMX_EPTP_AD_ENABLE_BIT;
5346 eptp |= (root_hpa & PAGE_MASK);
5347
5348 return eptp;
5349}
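
/*
 * Rough sketch of the EPTP value assembled above (per the SDM): bits
 * 2:0 hold the memory type (VMX_EPTP_MT_WB is 6), bits 5:3 hold the
 * page-walk length minus one (so VMX_EPTP_PWL_4/VMX_EPTP_PWL_5 select
 * 4- and 5-level walks), bit 6 (VMX_EPTP_AD_ENABLE_BIT) enables
 * accessed/dirty flags, and the page-aligned upper bits carry the
 * physical address of the root EPT table.
 */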
5350
5351static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
5352{
5353 struct kvm *kvm = vcpu->kvm;
5354 unsigned long guest_cr3;
5355 u64 eptp;
5356
5357 guest_cr3 = cr3;
5358 if (enable_ept) {
5359 eptp = construct_eptp(vcpu, cr3);
5360 vmcs_write64(EPT_POINTER, eptp);
5361
5362 if (kvm_x86_ops->tlb_remote_flush) {
5363 spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
5364 to_vmx(vcpu)->ept_pointer = eptp;
5365 to_kvm_vmx(kvm)->ept_pointers_match
5366 = EPT_POINTERS_CHECK;
5367 spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
5368 }
5369
5370 if (enable_unrestricted_guest || is_paging(vcpu) ||
5371 is_guest_mode(vcpu))
5372 guest_cr3 = kvm_read_cr3(vcpu);
5373 else
5374 guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
5375 ept_load_pdptrs(vcpu);
5376 }
5377
5378 vmcs_writel(GUEST_CR3, guest_cr3);
5379}
5380
5381static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
5382{
5383 /*
5384 * Pass through host's Machine Check Enable value to hw_cr4, which
5385 * is in force while we are in guest mode. Do not let guests control
5386 * this bit, even if host CR4.MCE == 0.
5387 */
5388 unsigned long hw_cr4;
5389
5390 hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
5391 if (enable_unrestricted_guest)
5392 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
5393 else if (to_vmx(vcpu)->rmode.vm86_active)
5394 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
5395 else
5396 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
5397
5398 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
5399 if (cr4 & X86_CR4_UMIP) {
5400 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
5401 SECONDARY_EXEC_DESC);
5402 hw_cr4 &= ~X86_CR4_UMIP;
5403 } else if (!is_guest_mode(vcpu) ||
5404 !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC))
5405 vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
5406 SECONDARY_EXEC_DESC);
5407 }
5408
5409 if (cr4 & X86_CR4_VMXE) {
5410 /*
5411 * To use VMXON (and later other VMX instructions), a guest
5412 * must first be able to turn on cr4.VMXE (see handle_vmon()).
5413 * So basically the check on whether to allow nested VMX
5414 * is here. We operate under the default treatment of SMM,
5415 * so VMX cannot be enabled under SMM.
5416 */
5417 if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
5418 return 1;
5419 }
5420
5421 if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
5422 return 1;
5423
5424 vcpu->arch.cr4 = cr4;
5425
5426 if (!enable_unrestricted_guest) {
5427 if (enable_ept) {
5428 if (!is_paging(vcpu)) {
5429 hw_cr4 &= ~X86_CR4_PAE;
5430 hw_cr4 |= X86_CR4_PSE;
5431 } else if (!(cr4 & X86_CR4_PAE)) {
5432 hw_cr4 &= ~X86_CR4_PAE;
5433 }
5434 }
5435
5436 /*
5437 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
5438 * hardware. To emulate this behavior, SMEP/SMAP/PKU needs
5439 * to be manually disabled when guest switches to non-paging
5440 * mode.
5441 *
5442 * If !enable_unrestricted_guest, the CPU is always running
5443 * with CR0.PG=1 and CR4 needs to be modified.
5444 * If enable_unrestricted_guest, the CPU automatically
5445 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
5446 */
5447 if (!is_paging(vcpu))
5448 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
5449 }
5450
5451 vmcs_writel(CR4_READ_SHADOW, cr4);
5452 vmcs_writel(GUEST_CR4, hw_cr4);
5453 return 0;
5454}
5455
5456static void vmx_get_segment(struct kvm_vcpu *vcpu,
5457 struct kvm_segment *var, int seg)
5458{
5459 struct vcpu_vmx *vmx = to_vmx(vcpu);
5460 u32 ar;
5461
5462 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
5463 *var = vmx->rmode.segs[seg];
5464 if (seg == VCPU_SREG_TR
5465 || var->selector == vmx_read_guest_seg_selector(vmx, seg))
5466 return;
5467 var->base = vmx_read_guest_seg_base(vmx, seg);
5468 var->selector = vmx_read_guest_seg_selector(vmx, seg);
5469 return;
5470 }
5471 var->base = vmx_read_guest_seg_base(vmx, seg);
5472 var->limit = vmx_read_guest_seg_limit(vmx, seg);
5473 var->selector = vmx_read_guest_seg_selector(vmx, seg);
5474 ar = vmx_read_guest_seg_ar(vmx, seg);
5475 var->unusable = (ar >> 16) & 1;
5476 var->type = ar & 15;
5477 var->s = (ar >> 4) & 1;
5478 var->dpl = (ar >> 5) & 3;
5479 /*
5480	 * Some userspaces do not preserve the unusable property. Since a usable
5481	 * segment has to be present according to the VMX spec, we can use the
5482	 * present property to work around the userspace bug by making an unusable
5483	 * segment always non-present. vmx_segment_access_rights() already marks a
5484	 * non-present segment as unusable.
5485 */
5486 var->present = !var->unusable;
5487 var->avl = (ar >> 12) & 1;
5488 var->l = (ar >> 13) & 1;
5489 var->db = (ar >> 14) & 1;
5490 var->g = (ar >> 15) & 1;
5491}
5492
5493static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
5494{
5495 struct kvm_segment s;
5496
5497 if (to_vmx(vcpu)->rmode.vm86_active) {
5498 vmx_get_segment(vcpu, &s, seg);
5499 return s.base;
5500 }
5501 return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
5502}
5503
5504static int vmx_get_cpl(struct kvm_vcpu *vcpu)
5505{
5506 struct vcpu_vmx *vmx = to_vmx(vcpu);
5507
5508 if (unlikely(vmx->rmode.vm86_active))
5509 return 0;
5510 else {
5511 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
5512 return VMX_AR_DPL(ar);
5513 }
5514}
5515
5516static u32 vmx_segment_access_rights(struct kvm_segment *var)
5517{
5518 u32 ar;
5519
5520 if (var->unusable || !var->present)
5521 ar = 1 << 16;
5522 else {
5523 ar = var->type & 15;
5524 ar |= (var->s & 1) << 4;
5525 ar |= (var->dpl & 3) << 5;
5526 ar |= (var->present & 1) << 7;
5527 ar |= (var->avl & 1) << 12;
5528 ar |= (var->l & 1) << 13;
5529 ar |= (var->db & 1) << 14;
5530 ar |= (var->g & 1) << 15;
5531 }
5532
5533 return ar;
5534}
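
/*
 * For reference, the access-rights word built above follows the VMX
 * segment AR layout used throughout this file: type in bits 3:0, S in
 * bit 4, DPL in bits 6:5, P in bit 7, AVL/L/D-B/G in bits 12-15, and
 * the "unusable" flag in bit 16, which is all this function sets for
 * an unusable or non-present segment.
 */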
5535
5536static void vmx_set_segment(struct kvm_vcpu *vcpu,
5537 struct kvm_segment *var, int seg)
5538{
5539 struct vcpu_vmx *vmx = to_vmx(vcpu);
5540 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
5541
5542 vmx_segment_cache_clear(vmx);
5543
5544 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
5545 vmx->rmode.segs[seg] = *var;
5546 if (seg == VCPU_SREG_TR)
5547 vmcs_write16(sf->selector, var->selector);
5548 else if (var->s)
5549 fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
5550 goto out;
5551 }
5552
5553 vmcs_writel(sf->base, var->base);
5554 vmcs_write32(sf->limit, var->limit);
5555 vmcs_write16(sf->selector, var->selector);
5556
5557 /*
5558 * Fix the "Accessed" bit in AR field of segment registers for older
5559 * qemu binaries.
5560 * IA32 arch specifies that at the time of processor reset the
5561 * "Accessed" bit in the AR field of segment registers is 1. And qemu
5562 * is setting it to 0 in the userland code. This causes invalid guest
5563 * state vmexit when "unrestricted guest" mode is turned on.
5564 * Fix for this setup issue in cpu_reset is being pushed in the qemu
5565 * tree. Newer qemu binaries with that qemu fix would not need this
5566 * kvm hack.
5567 */
5568 if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
5569 var->type |= 0x1; /* Accessed */
5570
5571 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
5572
5573out:
5574 vmx->emulation_required = emulation_required(vcpu);
5575}
5576
5577static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
5578{
5579 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
5580
5581 *db = (ar >> 14) & 1;
5582 *l = (ar >> 13) & 1;
5583}
5584
5585static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
5586{
5587 dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
5588 dt->address = vmcs_readl(GUEST_IDTR_BASE);
5589}
5590
5591static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
5592{
5593 vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
5594 vmcs_writel(GUEST_IDTR_BASE, dt->address);
5595}
5596
5597static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
5598{
5599 dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
5600 dt->address = vmcs_readl(GUEST_GDTR_BASE);
5601}
5602
5603static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
5604{
5605 vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
5606 vmcs_writel(GUEST_GDTR_BASE, dt->address);
5607}
5608
5609static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
5610{
5611 struct kvm_segment var;
5612 u32 ar;
5613
5614 vmx_get_segment(vcpu, &var, seg);
5615 var.dpl = 0x3;
5616 if (seg == VCPU_SREG_CS)
5617 var.type = 0x3;
5618 ar = vmx_segment_access_rights(&var);
5619
5620 if (var.base != (var.selector << 4))
5621 return false;
5622 if (var.limit != 0xffff)
5623 return false;
5624 if (ar != 0xf3)
5625 return false;
5626
5627 return true;
5628}
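
/*
 * Example of the real-mode invariant checked above: a segment loaded
 * with selector 0x1234 must have base 0x1234 << 4 = 0x12340, a limit
 * of 0xffff and access rights 0xf3 (present, S=1, DPL=3, read/write
 * accessed data).  Anything else means the segment was set up for
 * protected mode and cannot be represented in vm86, so
 * guest_state_valid() below reports the guest state as invalid and
 * instruction emulation is required.
 */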
5629
5630static bool code_segment_valid(struct kvm_vcpu *vcpu)
5631{
5632 struct kvm_segment cs;
5633 unsigned int cs_rpl;
5634
5635 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
5636 cs_rpl = cs.selector & SEGMENT_RPL_MASK;
5637
5638 if (cs.unusable)
5639 return false;
5640 if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
5641 return false;
5642 if (!cs.s)
5643 return false;
5644 if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
5645 if (cs.dpl > cs_rpl)
5646 return false;
5647 } else {
5648 if (cs.dpl != cs_rpl)
5649 return false;
5650 }
5651 if (!cs.present)
5652 return false;
5653
5654 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
5655 return true;
5656}
5657
5658static bool stack_segment_valid(struct kvm_vcpu *vcpu)
5659{
5660 struct kvm_segment ss;
5661 unsigned int ss_rpl;
5662
5663 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
5664 ss_rpl = ss.selector & SEGMENT_RPL_MASK;
5665
5666 if (ss.unusable)
5667 return true;
5668 if (ss.type != 3 && ss.type != 7)
5669 return false;
5670 if (!ss.s)
5671 return false;
5672 if (ss.dpl != ss_rpl) /* DPL != RPL */
5673 return false;
5674 if (!ss.present)
5675 return false;
5676
5677 return true;
5678}
5679
5680static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
5681{
5682 struct kvm_segment var;
5683 unsigned int rpl;
5684
5685 vmx_get_segment(vcpu, &var, seg);
5686 rpl = var.selector & SEGMENT_RPL_MASK;
5687
5688 if (var.unusable)
5689 return true;
5690 if (!var.s)
5691 return false;
5692 if (!var.present)
5693 return false;
5694 if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
5695 if (var.dpl < rpl) /* DPL < RPL */
5696 return false;
5697 }
5698
5699 /* TODO: Add other members to kvm_segment_field to allow checking for other access
5700 * rights flags
5701 */
5702 return true;
5703}
5704
5705static bool tr_valid(struct kvm_vcpu *vcpu)
5706{
5707 struct kvm_segment tr;
5708
5709 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
5710
5711 if (tr.unusable)
5712 return false;
5713 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */
5714 return false;
5715 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
5716 return false;
5717 if (!tr.present)
5718 return false;
5719
5720 return true;
5721}
5722
5723static bool ldtr_valid(struct kvm_vcpu *vcpu)
5724{
5725 struct kvm_segment ldtr;
5726
5727 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
5728
5729 if (ldtr.unusable)
5730 return true;
5731 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */
5732 return false;
5733 if (ldtr.type != 2)
5734 return false;
5735 if (!ldtr.present)
5736 return false;
5737
5738 return true;
5739}
5740
5741static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
5742{
5743 struct kvm_segment cs, ss;
5744
5745 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
5746 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
5747
5748 return ((cs.selector & SEGMENT_RPL_MASK) ==
5749 (ss.selector & SEGMENT_RPL_MASK));
5750}
5751
5752/*
5753 * Check if the guest state is valid. Returns true if valid, false
5754 * if not.
5755 * We assume that the registers are always usable.
5756 */
5757static bool guest_state_valid(struct kvm_vcpu *vcpu)
5758{
5759 if (enable_unrestricted_guest)
5760 return true;
5761
5762 /* real mode guest state checks */
5763 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
5764 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
5765 return false;
5766 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
5767 return false;
5768 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
5769 return false;
5770 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
5771 return false;
5772 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
5773 return false;
5774 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
5775 return false;
5776 } else {
5777 /* protected mode guest state checks */
5778 if (!cs_ss_rpl_check(vcpu))
5779 return false;
5780 if (!code_segment_valid(vcpu))
5781 return false;
5782 if (!stack_segment_valid(vcpu))
5783 return false;
5784 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
5785 return false;
5786 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
5787 return false;
5788 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
5789 return false;
5790 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
5791 return false;
5792 if (!tr_valid(vcpu))
5793 return false;
5794 if (!ldtr_valid(vcpu))
5795 return false;
5796 }
5797 /* TODO:
5798 * - Add checks on RIP
5799 * - Add checks on RFLAGS
5800 */
5801
5802 return true;
5803}
5804
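/*
 * A guest physical address is considered valid here if it is page aligned
 * and does not exceed the guest's physical address width
 * (cpuid_maxphyaddr()).
 */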
5805static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
5806{
5807 return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
5808}
5809
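/*
 * Populate the three-page real-mode TSS at tss_addr: clear the pages,
 * point the I/O bitmap base past the interrupt redirection map, and
 * write the conventional all-ones terminator byte at the end of the
 * I/O bitmap.
 */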
5810static int init_rmode_tss(struct kvm *kvm)
5811{
5812 gfn_t fn;
5813 u16 data = 0;
5814 int idx, r;
5815
5816 idx = srcu_read_lock(&kvm->srcu);
5817 fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT;
5818 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
5819 if (r < 0)
5820 goto out;
5821 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
5822 r = kvm_write_guest_page(kvm, fn++, &data,
5823 TSS_IOPB_BASE_OFFSET, sizeof(u16));
5824 if (r < 0)
5825 goto out;
5826 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
5827 if (r < 0)
5828 goto out;
5829 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
5830 if (r < 0)
5831 goto out;
5832 data = ~0;
5833 r = kvm_write_guest_page(kvm, fn, &data,
5834 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
5835 sizeof(u8));
5836out:
5837 srcu_read_unlock(&kvm->srcu, idx);
5838 return r;
5839}
5840
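/*
 * Build the one-page EPT identity-map page table at ept_identity_map_addr:
 * 1024 4MB PSE entries that map the low 4GB of guest physical memory onto
 * itself.  Done once per VM, under slots_lock.
 */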
5841static int init_rmode_identity_map(struct kvm *kvm)
5842{
5843 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
5844 int i, idx, r = 0;
5845 kvm_pfn_t identity_map_pfn;
5846 u32 tmp;
5847
5848 /* Protect kvm_vmx->ept_identity_pagetable_done. */
5849 mutex_lock(&kvm->slots_lock);
5850
5851 if (likely(kvm_vmx->ept_identity_pagetable_done))
5852 goto out2;
5853
5854 if (!kvm_vmx->ept_identity_map_addr)
5855 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
5856 identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT;
5857
5858 r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
5859 kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
5860 if (r < 0)
5861 goto out2;
5862
5863 idx = srcu_read_lock(&kvm->srcu);
5864 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
5865 if (r < 0)
5866 goto out;
5867 /* Set up identity-mapping pagetable for EPT in real mode */
5868 for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
5869 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
5870 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
5871 r = kvm_write_guest_page(kvm, identity_map_pfn,
5872 &tmp, i * sizeof(tmp), sizeof(tmp));
5873 if (r < 0)
5874 goto out;
5875 }
5876 kvm_vmx->ept_identity_pagetable_done = true;
5877
5878out:
5879 srcu_read_unlock(&kvm->srcu, idx);
5880
5881out2:
5882 mutex_unlock(&kvm->slots_lock);
5883 return r;
5884}
5885
5886static void seg_setup(int seg)
5887{
5888 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
5889 unsigned int ar;
5890
5891 vmcs_write16(sf->selector, 0);
5892 vmcs_writel(sf->base, 0);
5893 vmcs_write32(sf->limit, 0xffff);
5894 ar = 0x93;
5895 if (seg == VCPU_SREG_CS)
5896 ar |= 0x08; /* code segment */
5897
5898 vmcs_write32(sf->ar_bytes, ar);
5899}
5900
5901static int alloc_apic_access_page(struct kvm *kvm)
5902{
5903 struct page *page;
5904 int r = 0;
5905
5906 mutex_lock(&kvm->slots_lock);
5907 if (kvm->arch.apic_access_page_done)
5908 goto out;
5909 r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
5910 APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
5911 if (r)
5912 goto out;
5913
5914 page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
5915 if (is_error_page(page)) {
5916 r = -EFAULT;
5917 goto out;
5918 }
5919
5920 /*
5921 * Do not pin the page in memory, so that memory hot-unplug
5922 * is able to migrate it.
5923 */
5924 put_page(page);
5925 kvm->arch.apic_access_page_done = true;
5926out:
5927 mutex_unlock(&kvm->slots_lock);
5928 return r;
5929}
5930
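/*
 * Allocate a VPID for the vCPU from the global bitmap.  Returns 0 when
 * VPID is disabled or the pool is exhausted; 0 is never handed out to a
 * vCPU and is skipped by free_vpid().
 */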
5931static int allocate_vpid(void)
5932{
5933 int vpid;
5934
5935 if (!enable_vpid)
5936 return 0;
5937 spin_lock(&vmx_vpid_lock);
5938 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
5939 if (vpid < VMX_NR_VPIDS)
5940 __set_bit(vpid, vmx_vpid_bitmap);
5941 else
5942 vpid = 0;
5943 spin_unlock(&vmx_vpid_lock);
5944 return vpid;
5945}
5946
5947static void free_vpid(int vpid)
5948{
5949 if (!enable_vpid || vpid == 0)
5950 return;
5951 spin_lock(&vmx_vpid_lock);
5952 __clear_bit(vpid, vmx_vpid_bitmap);
5953 spin_unlock(&vmx_vpid_lock);
5954}
5955
5956static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
5957 u32 msr, int type)
5958{
5959 int f = sizeof(unsigned long);
5960
5961 if (!cpu_has_vmx_msr_bitmap())
5962 return;
5963
5964 if (static_branch_unlikely(&enable_evmcs))
5965 evmcs_touch_msr_bitmap();
5966
5967 /*
5968 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
5969 * have the write-low and read-high bitmap offsets the wrong way round.
5970 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
5971 */
5972 if (msr <= 0x1fff) {
5973 if (type & MSR_TYPE_R)
5974 /* read-low */
5975 __clear_bit(msr, msr_bitmap + 0x000 / f);
5976
5977 if (type & MSR_TYPE_W)
5978 /* write-low */
5979 __clear_bit(msr, msr_bitmap + 0x800 / f);
5980
5981 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
5982 msr &= 0x1fff;
5983 if (type & MSR_TYPE_R)
5984 /* read-high */
5985 __clear_bit(msr, msr_bitmap + 0x400 / f);
5986
5987 if (type & MSR_TYPE_W)
5988 /* write-high */
5989 __clear_bit(msr, msr_bitmap + 0xc00 / f);
5990
5991 }
5992}
5993
5994static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
5995 u32 msr, int type)
5996{
5997 int f = sizeof(unsigned long);
5998
5999 if (!cpu_has_vmx_msr_bitmap())
6000 return;
6001
6002 if (static_branch_unlikely(&enable_evmcs))
6003 evmcs_touch_msr_bitmap();
6004
6005 /*
6006 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
6007 * have the write-low and read-high bitmap offsets the wrong way round.
6008 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
6009 */
6010 if (msr <= 0x1fff) {
6011 if (type & MSR_TYPE_R)
6012 /* read-low */
6013 __set_bit(msr, msr_bitmap + 0x000 / f);
6014
6015 if (type & MSR_TYPE_W)
6016 /* write-low */
6017 __set_bit(msr, msr_bitmap + 0x800 / f);
6018
6019 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
6020 msr &= 0x1fff;
6021 if (type & MSR_TYPE_R)
6022 /* read-high */
6023 __set_bit(msr, msr_bitmap + 0x400 / f);
6024
6025 if (type & MSR_TYPE_W)
6026 /* write-high */
6027 __set_bit(msr, msr_bitmap + 0xc00 / f);
6028
6029 }
6030}
6031
6032static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
6033 u32 msr, int type, bool value)
6034{
6035 if (value)
6036 vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
6037 else
6038 vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
6039}
6040
6041/*
 6042 * If an MSR is allowed by L0, we should check whether it is allowed by L1.
 6043 * The corresponding bit will be cleared unless both L0 and L1 allow it.
6044 */
6045static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
6046 unsigned long *msr_bitmap_nested,
6047 u32 msr, int type)
6048{
6049 int f = sizeof(unsigned long);
6050
6051 /*
6052 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
6053 * have the write-low and read-high bitmap offsets the wrong way round.
6054 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
6055 */
6056 if (msr <= 0x1fff) {
6057 if (type & MSR_TYPE_R &&
6058 !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
6059 /* read-low */
6060 __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
6061
6062 if (type & MSR_TYPE_W &&
6063 !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
6064 /* write-low */
6065 __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
6066
6067 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
6068 msr &= 0x1fff;
6069 if (type & MSR_TYPE_R &&
6070 !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
6071 /* read-high */
6072 __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
6073
6074 if (type & MSR_TYPE_W &&
6075 !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
6076 /* write-high */
6077 __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
6078
6079 }
6080}
6081
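/*
 * Compute which x2APIC interception mode the vCPU's MSR bitmap should
 * reflect, based on the current secondary execution controls and whether
 * APICv is active.
 */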
6082static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
6083{
6084 u8 mode = 0;
6085
6086 if (cpu_has_secondary_exec_ctrls() &&
6087 (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
6088 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
6089 mode |= MSR_BITMAP_MODE_X2APIC;
6090 if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
6091 mode |= MSR_BITMAP_MODE_X2APIC_APICV;
6092 }
6093
6094 return mode;
6095}
6096
6097#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
6098
6099static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
6100 u8 mode)
6101{
6102 int msr;
6103
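 /*
 * The x2APIC MSRs (0x800 - 0x8ff) live in the read-low/write-low
 * quadrants of the bitmap.  Pass through reads only when APICv is
 * active, and intercept all writes by default; selected registers
 * are opened up below.
 */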
6104 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
6105 unsigned word = msr / BITS_PER_LONG;
6106 msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
6107 msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
6108 }
6109
6110 if (mode & MSR_BITMAP_MODE_X2APIC) {
6111 /*
6112 * TPR reads and writes can be virtualized even if virtual interrupt
6113 * delivery is not in use.
6114 */
6115 vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
6116 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
6117 vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
6118 vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
6119 vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
6120 }
6121 }
6122}
6123
6124static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
6125{
6126 struct vcpu_vmx *vmx = to_vmx(vcpu);
6127 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
6128 u8 mode = vmx_msr_bitmap_mode(vcpu);
6129 u8 changed = mode ^ vmx->msr_bitmap_mode;
6130
6131 if (!changed)
6132 return;
6133
6134 if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
6135 vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
6136
6137 vmx->msr_bitmap_mode = mode;
6138}
6139
6140static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
6141{
6142 return enable_apicv;
6143}
6144
6145static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
6146{
6147 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6148 gfn_t gfn;
6149
6150 /*
6151 * Don't need to mark the APIC access page dirty; it is never
6152 * written to by the CPU during APIC virtualization.
6153 */
6154
6155 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
6156 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
6157 kvm_vcpu_mark_page_dirty(vcpu, gfn);
6158 }
6159
6160 if (nested_cpu_has_posted_intr(vmcs12)) {
6161 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
6162 kvm_vcpu_mark_page_dirty(vcpu, gfn);
6163 }
6164}
6165
6166
6167static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
6168{
6169 struct vcpu_vmx *vmx = to_vmx(vcpu);
6170 int max_irr;
6171 void *vapic_page;
6172 u16 status;
6173
6174 if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
6175 return;
6176
6177 vmx->nested.pi_pending = false;
6178 if (!pi_test_and_clear_on(vmx->nested.pi_desc))
6179 return;
6180
6181 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
6182 if (max_irr != 256) {
6183 vapic_page = kmap(vmx->nested.virtual_apic_page);
6184 __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
6185 vapic_page, &max_irr);
6186 kunmap(vmx->nested.virtual_apic_page);
6187
6188 status = vmcs_read16(GUEST_INTR_STATUS);
6189 if ((u8)max_irr > ((u8)status & 0xff)) {
6190 status &= ~0xff;
6191 status |= (u8)max_irr;
6192 vmcs_write16(GUEST_INTR_STATUS, status);
6193 }
6194 }
6195
6196 nested_mark_vmcs12_pages_dirty(vcpu);
6197}
6198
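/* The low byte of GUEST_INTR_STATUS is the Requesting Virtual Interrupt (RVI). */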
6199static u8 vmx_get_rvi(void)
6200{
6201 return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
6202}
6203
6204static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
6205{
6206 struct vcpu_vmx *vmx = to_vmx(vcpu);
6207 void *vapic_page;
6208 u32 vppr;
6209 int rvi;
6210
6211 if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
6212 !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
6213 WARN_ON_ONCE(!vmx->nested.virtual_apic_page))
6214 return false;
6215
6216 rvi = vmx_get_rvi();
6217
6218 vapic_page = kmap(vmx->nested.virtual_apic_page);
6219 vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
6220 kunmap(vmx->nested.virtual_apic_page);
6221
6222 return ((rvi & 0xf0) > (vppr & 0xf0));
6223}
6224
6225static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
6226 bool nested)
6227{
6228#ifdef CONFIG_SMP
6229 int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
6230
6231 if (vcpu->mode == IN_GUEST_MODE) {
6232 /*
 6233 * The vector of the interrupt to be delivered to the vcpu
 6234 * was set in the PIR before this function was called.
 6235 *
 6236 * The following cases can be reached in this block, and
 6237 * we always send a notification event in all of them, as
 6238 * explained below.
 6239 *
 6240 * Case 1: The vcpu stays in non-root mode. Sending a
 6241 * notification event posts the interrupt to the vcpu.
 6242 *
 6243 * Case 2: The vcpu exits to root mode and is still
 6244 * runnable. The PIR will be synced to the vIRR before
 6245 * the next vcpu entry. Sending a notification event in
 6246 * this case has no effect, as the vcpu is no longer in
 6247 * non-root mode.
 6248 *
 6249 * Case 3: The vcpu exits to root mode and is blocked.
 6250 * vcpu_block() has already synced the PIR to the vIRR and
 6251 * never blocks the vcpu if the vIRR is not cleared.
 6252 * Therefore, a blocked vcpu here is not waiting for any
 6253 * interrupt requested in the PIR, and sending it a
 6254 * notification event, which has no effect, is safe.
6255 */
6256
6257 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
6258 return true;
6259 }
6260#endif
6261 return false;
6262}
6263
6264static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
6265 int vector)
6266{
6267 struct vcpu_vmx *vmx = to_vmx(vcpu);
6268
6269 if (is_guest_mode(vcpu) &&
6270 vector == vmx->nested.posted_intr_nv) {
6271 /*
 6272 * If a posted interrupt is not recognized by hardware,
 6273 * we will deliver it on the next vmentry.
6274 */
6275 vmx->nested.pi_pending = true;
6276 kvm_make_request(KVM_REQ_EVENT, vcpu);
6277 /* the PIR and ON have been set by L1. */
6278 if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
6279 kvm_vcpu_kick(vcpu);
6280 return 0;
6281 }
6282 return -1;
6283}
6284/*
 6285 * Send an interrupt to a vcpu via a posted interrupt.
 6286 * 1. If the target vcpu is running (non-root mode), send a posted interrupt
 6287 * notification to the vcpu; hardware will sync the PIR to the vIRR atomically.
 6288 * 2. If the target vcpu isn't running (root mode), kick it so that it picks up
 6289 * the interrupt from the PIR on the next vmentry.
6290 */
6291static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
6292{
6293 struct vcpu_vmx *vmx = to_vmx(vcpu);
6294 int r;
6295
6296 r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
6297 if (!r)
6298 return;
6299
6300 if (pi_test_and_set_pir(vector, &vmx->pi_desc))
6301 return;
6302
6303 /* If a previous notification has sent the IPI, nothing to do. */
6304 if (pi_test_and_set_on(&vmx->pi_desc))
6305 return;
6306
6307 if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
6308 kvm_vcpu_kick(vcpu);
6309}
6310
6311/*
6312 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
6313 * will not change in the lifetime of the guest.
6314 * Note that host-state that does change is set elsewhere. E.g., host-state
6315 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
6316 */
6317static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
6318{
6319 u32 low32, high32;
6320 unsigned long tmpl;
6321 struct desc_ptr dt;
6322 unsigned long cr0, cr3, cr4;
6323
6324 cr0 = read_cr0();
6325 WARN_ON(cr0 & X86_CR0_TS);
6326 vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
6327
6328 /*
6329 * Save the most likely value for this task's CR3 in the VMCS.
6330 * We can't use __get_current_cr3_fast() because we're not atomic.
6331 */
6332 cr3 = __read_cr3();
6333 vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
6334 vmx->loaded_vmcs->host_state.cr3 = cr3;
6335
6336 /* Save the most likely value for this task's CR4 in the VMCS. */
6337 cr4 = cr4_read_shadow();
6338 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
6339 vmx->loaded_vmcs->host_state.cr4 = cr4;
6340
6341 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
6342#ifdef CONFIG_X86_64
6343 /*
6344 * Load null selectors, so we can avoid reloading them in
6345 * vmx_prepare_switch_to_host(), in case userspace uses
6346 * the null selectors too (the expected case).
6347 */
6348 vmcs_write16(HOST_DS_SELECTOR, 0);
6349 vmcs_write16(HOST_ES_SELECTOR, 0);
6350#else
6351 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
6352 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
6353#endif
6354 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
6355 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
6356
6357 store_idt(&dt);
6358 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
6359 vmx->host_idt_base = dt.address;
6360
6361 vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */
6362
6363 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
6364 vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
6365 rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
6366 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
6367
6368 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
6369 rdmsr(MSR_IA32_CR_PAT, low32, high32);
6370 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
6371 }
6372
6373 if (cpu_has_load_ia32_efer)
6374 vmcs_write64(HOST_IA32_EFER, host_efer);
6375}
6376
6377static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
6378{
6379 vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
6380 if (enable_ept)
6381 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
6382 if (is_guest_mode(&vmx->vcpu))
6383 vmx->vcpu.arch.cr4_guest_owned_bits &=
6384 ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
6385 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
6386}
6387
6388static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
6389{
6390 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
6391
6392 if (!kvm_vcpu_apicv_active(&vmx->vcpu))
6393 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
6394
6395 if (!enable_vnmi)
6396 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
6397
6398 /* Enable the preemption timer dynamically */
6399 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
6400 return pin_based_exec_ctrl;
6401}
6402
6403static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
6404{
6405 struct vcpu_vmx *vmx = to_vmx(vcpu);
6406
6407 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
6408 if (cpu_has_secondary_exec_ctrls()) {
6409 if (kvm_vcpu_apicv_active(vcpu))
6410 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
6411 SECONDARY_EXEC_APIC_REGISTER_VIRT |
6412 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
6413 else
6414 vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
6415 SECONDARY_EXEC_APIC_REGISTER_VIRT |
6416 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
6417 }
6418
6419 if (cpu_has_vmx_msr_bitmap())
6420 vmx_update_msr_bitmap(vcpu);
6421}
6422
6423static u32 vmx_exec_control(struct vcpu_vmx *vmx)
6424{
6425 u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
6426
6427 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
6428 exec_control &= ~CPU_BASED_MOV_DR_EXITING;
6429
6430 if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
6431 exec_control &= ~CPU_BASED_TPR_SHADOW;
6432#ifdef CONFIG_X86_64
6433 exec_control |= CPU_BASED_CR8_STORE_EXITING |
6434 CPU_BASED_CR8_LOAD_EXITING;
6435#endif
6436 }
6437 if (!enable_ept)
6438 exec_control |= CPU_BASED_CR3_STORE_EXITING |
6439 CPU_BASED_CR3_LOAD_EXITING |
6440 CPU_BASED_INVLPG_EXITING;
6441 if (kvm_mwait_in_guest(vmx->vcpu.kvm))
6442 exec_control &= ~(CPU_BASED_MWAIT_EXITING |
6443 CPU_BASED_MONITOR_EXITING);
6444 if (kvm_hlt_in_guest(vmx->vcpu.kvm))
6445 exec_control &= ~CPU_BASED_HLT_EXITING;
6446 return exec_control;
6447}
6448
6449static bool vmx_rdrand_supported(void)
6450{
6451 return vmcs_config.cpu_based_2nd_exec_ctrl &
6452 SECONDARY_EXEC_RDRAND_EXITING;
6453}
6454
6455static bool vmx_rdseed_supported(void)
6456{
6457 return vmcs_config.cpu_based_2nd_exec_ctrl &
6458 SECONDARY_EXEC_RDSEED_EXITING;
6459}
6460
6461static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
6462{
6463 struct kvm_vcpu *vcpu = &vmx->vcpu;
6464
6465 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
6466
6467 if (!cpu_need_virtualize_apic_accesses(vcpu))
6468 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6469 if (vmx->vpid == 0)
6470 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
6471 if (!enable_ept) {
6472 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
6473 enable_unrestricted_guest = 0;
6474 }
6475 if (!enable_unrestricted_guest)
6476 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
6477 if (kvm_pause_in_guest(vmx->vcpu.kvm))
6478 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
6479 if (!kvm_vcpu_apicv_active(vcpu))
6480 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
6481 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
6482 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
6483
 6484 /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP
 6485 * in vmx_set_cr4. */
6486 exec_control &= ~SECONDARY_EXEC_DESC;
6487
 6488 /*
 6489 * SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
 6490 * (handle_vmptrld). We cannot enable shadow_vmcs here because we
 6491 * don't yet have a current VMCS12.
 6492 */
6493 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
6494
6495 if (!enable_pml)
6496 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
6497
6498 if (vmx_xsaves_supported()) {
6499 /* Exposing XSAVES only when XSAVE is exposed */
6500 bool xsaves_enabled =
6501 guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
6502 guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
6503
6504 if (!xsaves_enabled)
6505 exec_control &= ~SECONDARY_EXEC_XSAVES;
6506
6507 if (nested) {
6508 if (xsaves_enabled)
6509 vmx->nested.msrs.secondary_ctls_high |=
6510 SECONDARY_EXEC_XSAVES;
6511 else
6512 vmx->nested.msrs.secondary_ctls_high &=
6513 ~SECONDARY_EXEC_XSAVES;
6514 }
6515 }
6516
6517 if (vmx_rdtscp_supported()) {
6518 bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
6519 if (!rdtscp_enabled)
6520 exec_control &= ~SECONDARY_EXEC_RDTSCP;
6521
6522 if (nested) {
6523 if (rdtscp_enabled)
6524 vmx->nested.msrs.secondary_ctls_high |=
6525 SECONDARY_EXEC_RDTSCP;
6526 else
6527 vmx->nested.msrs.secondary_ctls_high &=
6528 ~SECONDARY_EXEC_RDTSCP;
6529 }
6530 }
6531
6532 if (vmx_invpcid_supported()) {
6533 /* Exposing INVPCID only when PCID is exposed */
6534 bool invpcid_enabled =
6535 guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
6536 guest_cpuid_has(vcpu, X86_FEATURE_PCID);
6537
6538 if (!invpcid_enabled) {
6539 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
6540 guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
6541 }
6542
6543 if (nested) {
6544 if (invpcid_enabled)
6545 vmx->nested.msrs.secondary_ctls_high |=
6546 SECONDARY_EXEC_ENABLE_INVPCID;
6547 else
6548 vmx->nested.msrs.secondary_ctls_high &=
6549 ~SECONDARY_EXEC_ENABLE_INVPCID;
6550 }
6551 }
6552
6553 if (vmx_rdrand_supported()) {
6554 bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
6555 if (rdrand_enabled)
6556 exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
6557
6558 if (nested) {
6559 if (rdrand_enabled)
6560 vmx->nested.msrs.secondary_ctls_high |=
6561 SECONDARY_EXEC_RDRAND_EXITING;
6562 else
6563 vmx->nested.msrs.secondary_ctls_high &=
6564 ~SECONDARY_EXEC_RDRAND_EXITING;
6565 }
6566 }
6567
6568 if (vmx_rdseed_supported()) {
6569 bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
6570 if (rdseed_enabled)
6571 exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
6572
6573 if (nested) {
6574 if (rdseed_enabled)
6575 vmx->nested.msrs.secondary_ctls_high |=
6576 SECONDARY_EXEC_RDSEED_EXITING;
6577 else
6578 vmx->nested.msrs.secondary_ctls_high &=
6579 ~SECONDARY_EXEC_RDSEED_EXITING;
6580 }
6581 }
6582
6583 vmx->secondary_exec_control = exec_control;
6584}
6585
6586static void ept_set_mmio_spte_mask(void)
6587{
6588 /*
6589 * EPT Misconfigurations can be generated if the value of bits 2:0
6590 * of an EPT paging-structure entry is 110b (write/execute).
6591 */
6592 kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
6593 VMX_EPT_MISCONFIG_WX_VALUE);
6594}
6595
6596#define VMX_XSS_EXIT_BITMAP 0
6597/*
6598 * Sets up the vmcs for emulated real mode.
6599 */
6600static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
6601{
6602 int i;
6603
6604 if (enable_shadow_vmcs) {
6605 /*
6606 * At vCPU creation, "VMWRITE to any supported field
6607 * in the VMCS" is supported, so use the more
6608 * permissive vmx_vmread_bitmap to specify both read
6609 * and write permissions for the shadow VMCS.
6610 */
6611 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
6612 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap));
6613 }
6614 if (cpu_has_vmx_msr_bitmap())
6615 vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
6616
6617 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
6618
6619 /* Control */
6620 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
6621 vmx->hv_deadline_tsc = -1;
6622
6623 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
6624
6625 if (cpu_has_secondary_exec_ctrls()) {
6626 vmx_compute_secondary_exec_control(vmx);
6627 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
6628 vmx->secondary_exec_control);
6629 }
6630
6631 if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
6632 vmcs_write64(EOI_EXIT_BITMAP0, 0);
6633 vmcs_write64(EOI_EXIT_BITMAP1, 0);
6634 vmcs_write64(EOI_EXIT_BITMAP2, 0);
6635 vmcs_write64(EOI_EXIT_BITMAP3, 0);
6636
6637 vmcs_write16(GUEST_INTR_STATUS, 0);
6638
6639 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
6640 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
6641 }
6642
6643 if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
6644 vmcs_write32(PLE_GAP, ple_gap);
6645 vmx->ple_window = ple_window;
6646 vmx->ple_window_dirty = true;
6647 }
6648
6649 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
6650 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
6651 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
6652
6653 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
6654 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
6655 vmx_set_constant_host_state(vmx);
6656 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
6657 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
6658
6659 if (cpu_has_vmx_vmfunc())
6660 vmcs_write64(VM_FUNCTION_CONTROL, 0);
6661
6662 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
6663 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
6664 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
6665 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
6666 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
6667
6668 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
6669 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
6670
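 /*
 * Probe each MSR in vmx_msr_index with rdmsr_safe()/wrmsr_safe() and
 * add only the ones the host actually supports to the guest MSR
 * save area.
 */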
6671 for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
6672 u32 index = vmx_msr_index[i];
6673 u32 data_low, data_high;
6674 int j = vmx->nmsrs;
6675
6676 if (rdmsr_safe(index, &data_low, &data_high) < 0)
6677 continue;
6678 if (wrmsr_safe(index, data_low, data_high) < 0)
6679 continue;
6680 vmx->guest_msrs[j].index = i;
6681 vmx->guest_msrs[j].data = 0;
6682 vmx->guest_msrs[j].mask = -1ull;
6683 ++vmx->nmsrs;
6684 }
6685
6686 vmx->arch_capabilities = kvm_get_arch_capabilities();
6687
6688 vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
6689
6690 /* 22.2.1, 20.8.1 */
6691 vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl);
6692
6693 vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
6694 vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
6695
6696 set_cr4_guest_host_mask(vmx);
6697
6698 if (vmx_xsaves_supported())
6699 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
6700
6701 if (enable_pml) {
6702 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
6703 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
6704 }
6705
6706 if (cpu_has_vmx_encls_vmexit())
6707 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
6708}
6709
6710static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
6711{
6712 struct vcpu_vmx *vmx = to_vmx(vcpu);
6713 struct msr_data apic_base_msr;
6714 u64 cr0;
6715
6716 vmx->rmode.vm86_active = 0;
6717 vmx->spec_ctrl = 0;
6718
6719 vcpu->arch.microcode_version = 0x100000000ULL;
6720 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
6721 kvm_set_cr8(vcpu, 0);
6722
6723 if (!init_event) {
6724 apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
6725 MSR_IA32_APICBASE_ENABLE;
6726 if (kvm_vcpu_is_reset_bsp(vcpu))
6727 apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
6728 apic_base_msr.host_initiated = true;
6729 kvm_set_apic_base(vcpu, &apic_base_msr);
6730 }
6731
6732 vmx_segment_cache_clear(vmx);
6733
6734 seg_setup(VCPU_SREG_CS);
6735 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
6736 vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
6737
6738 seg_setup(VCPU_SREG_DS);
6739 seg_setup(VCPU_SREG_ES);
6740 seg_setup(VCPU_SREG_FS);
6741 seg_setup(VCPU_SREG_GS);
6742 seg_setup(VCPU_SREG_SS);
6743
6744 vmcs_write16(GUEST_TR_SELECTOR, 0);
6745 vmcs_writel(GUEST_TR_BASE, 0);
6746 vmcs_write32(GUEST_TR_LIMIT, 0xffff);
6747 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
6748
6749 vmcs_write16(GUEST_LDTR_SELECTOR, 0);
6750 vmcs_writel(GUEST_LDTR_BASE, 0);
6751 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
6752 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
6753
6754 if (!init_event) {
6755 vmcs_write32(GUEST_SYSENTER_CS, 0);
6756 vmcs_writel(GUEST_SYSENTER_ESP, 0);
6757 vmcs_writel(GUEST_SYSENTER_EIP, 0);
6758 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
6759 }
6760
6761 kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
6762 kvm_rip_write(vcpu, 0xfff0);
6763
6764 vmcs_writel(GUEST_GDTR_BASE, 0);
6765 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
6766
6767 vmcs_writel(GUEST_IDTR_BASE, 0);
6768 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
6769
6770 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
6771 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
6772 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
6773 if (kvm_mpx_supported())
6774 vmcs_write64(GUEST_BNDCFGS, 0);
6775
6776 setup_msrs(vmx);
6777
6778 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
6779
6780 if (cpu_has_vmx_tpr_shadow() && !init_event) {
6781 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
6782 if (cpu_need_tpr_shadow(vcpu))
6783 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
6784 __pa(vcpu->arch.apic->regs));
6785 vmcs_write32(TPR_THRESHOLD, 0);
6786 }
6787
6788 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
6789
6790 if (vmx->vpid != 0)
6791 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
6792
6793 cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
6794 vmx->vcpu.arch.cr0 = cr0;
6795 vmx_set_cr0(vcpu, cr0); /* enter rmode */
6796 vmx_set_cr4(vcpu, 0);
6797 vmx_set_efer(vcpu, 0);
6798
6799 update_exception_bitmap(vcpu);
6800
6801 vpid_sync_context(vmx->vpid);
6802 if (init_event)
6803 vmx_clear_hlt(vcpu);
6804}
6805
6806/*
6807 * In nested virtualization, check if L1 asked to exit on external interrupts.
6808 * For most existing hypervisors, this will always return true.
6809 */
6810static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
6811{
6812 return get_vmcs12(vcpu)->pin_based_vm_exec_control &
6813 PIN_BASED_EXT_INTR_MASK;
6814}
6815
6816/*
 6817 * In nested virtualization, check if L1 has set
 6818 * VM_EXIT_ACK_INTR_ON_EXIT.
6819 */
6820static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
6821{
6822 return get_vmcs12(vcpu)->vm_exit_controls &
6823 VM_EXIT_ACK_INTR_ON_EXIT;
6824}
6825
6826static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
6827{
6828 return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
6829}
6830
6831static void enable_irq_window(struct kvm_vcpu *vcpu)
6832{
6833 vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
6834 CPU_BASED_VIRTUAL_INTR_PENDING);
6835}
6836
6837static void enable_nmi_window(struct kvm_vcpu *vcpu)
6838{
6839 if (!enable_vnmi ||
6840 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
6841 enable_irq_window(vcpu);
6842 return;
6843 }
6844
6845 vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
6846 CPU_BASED_VIRTUAL_NMI_PENDING);
6847}
6848
6849static void vmx_inject_irq(struct kvm_vcpu *vcpu)
6850{
6851 struct vcpu_vmx *vmx = to_vmx(vcpu);
6852 uint32_t intr;
6853 int irq = vcpu->arch.interrupt.nr;
6854
6855 trace_kvm_inj_virq(irq);
6856
6857 ++vcpu->stat.irq_injections;
6858 if (vmx->rmode.vm86_active) {
6859 int inc_eip = 0;
6860 if (vcpu->arch.interrupt.soft)
6861 inc_eip = vcpu->arch.event_exit_inst_len;
6862 if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
6863 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
6864 return;
6865 }
6866 intr = irq | INTR_INFO_VALID_MASK;
6867 if (vcpu->arch.interrupt.soft) {
6868 intr |= INTR_TYPE_SOFT_INTR;
6869 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
6870 vmx->vcpu.arch.event_exit_inst_len);
6871 } else
6872 intr |= INTR_TYPE_EXT_INTR;
6873 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
6874
6875 vmx_clear_hlt(vcpu);
6876}
6877
6878static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
6879{
6880 struct vcpu_vmx *vmx = to_vmx(vcpu);
6881
6882 if (!enable_vnmi) {
6883 /*
6884 * Tracking the NMI-blocked state in software is built upon
6885 * finding the next open IRQ window. This, in turn, depends on
6886 * well-behaving guests: They have to keep IRQs disabled at
6887 * least as long as the NMI handler runs. Otherwise we may
6888 * cause NMI nesting, maybe breaking the guest. But as this is
6889 * highly unlikely, we can live with the residual risk.
6890 */
6891 vmx->loaded_vmcs->soft_vnmi_blocked = 1;
6892 vmx->loaded_vmcs->vnmi_blocked_time = 0;
6893 }
6894
6895 ++vcpu->stat.nmi_injections;
6896 vmx->loaded_vmcs->nmi_known_unmasked = false;
6897
6898 if (vmx->rmode.vm86_active) {
6899 if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
6900 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
6901 return;
6902 }
6903
6904 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
6905 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
6906
6907 vmx_clear_hlt(vcpu);
6908}
6909
6910static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
6911{
6912 struct vcpu_vmx *vmx = to_vmx(vcpu);
6913 bool masked;
6914
6915 if (!enable_vnmi)
6916 return vmx->loaded_vmcs->soft_vnmi_blocked;
6917 if (vmx->loaded_vmcs->nmi_known_unmasked)
6918 return false;
6919 masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
6920 vmx->loaded_vmcs->nmi_known_unmasked = !masked;
6921 return masked;
6922}
6923
6924static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
6925{
6926 struct vcpu_vmx *vmx = to_vmx(vcpu);
6927
6928 if (!enable_vnmi) {
6929 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
6930 vmx->loaded_vmcs->soft_vnmi_blocked = masked;
6931 vmx->loaded_vmcs->vnmi_blocked_time = 0;
6932 }
6933 } else {
6934 vmx->loaded_vmcs->nmi_known_unmasked = !masked;
6935 if (masked)
6936 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6937 GUEST_INTR_STATE_NMI);
6938 else
6939 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
6940 GUEST_INTR_STATE_NMI);
6941 }
6942}
6943
6944static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
6945{
6946 if (to_vmx(vcpu)->nested.nested_run_pending)
6947 return 0;
6948
6949 if (!enable_vnmi &&
6950 to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
6951 return 0;
6952
6953 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
6954 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
6955 | GUEST_INTR_STATE_NMI));
6956}
6957
6958static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
6959{
6960 return (!to_vmx(vcpu)->nested.nested_run_pending &&
6961 vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
6962 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
6963 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
6964}
6965
6966static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
6967{
6968 int ret;
6969
6970 if (enable_unrestricted_guest)
6971 return 0;
6972
6973 ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
6974 PAGE_SIZE * 3);
6975 if (ret)
6976 return ret;
6977 to_kvm_vmx(kvm)->tss_addr = addr;
6978 return init_rmode_tss(kvm);
6979}
6980
6981static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
6982{
6983 to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
6984 return 0;
6985}
6986
6987static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
6988{
6989 switch (vec) {
6990 case BP_VECTOR:
6991 /*
6992 * Update instruction length as we may reinject the exception
6993 * from user space while in guest debugging mode.
6994 */
6995 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
6996 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
6997 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
6998 return false;
6999 /* fall through */
7000 case DB_VECTOR:
7001 if (vcpu->guest_debug &
7002 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
7003 return false;
7004 /* fall through */
7005 case DE_VECTOR:
7006 case OF_VECTOR:
7007 case BR_VECTOR:
7008 case UD_VECTOR:
7009 case DF_VECTOR:
7010 case SS_VECTOR:
7011 case GP_VECTOR:
7012 case MF_VECTOR:
7013 return true;
7014 break;
7015 }
7016 return false;
7017}
7018
7019static int handle_rmode_exception(struct kvm_vcpu *vcpu,
7020 int vec, u32 err_code)
7021{
7022 /*
 7023 * Instructions with the address-size override prefix (opcode 0x67)
 7024 * cause a #SS fault with error code 0 in VM86 mode.
7025 */
7026 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
7027 if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) {
7028 if (vcpu->arch.halt_request) {
7029 vcpu->arch.halt_request = 0;
7030 return kvm_vcpu_halt(vcpu);
7031 }
7032 return 1;
7033 }
7034 return 0;
7035 }
7036
7037 /*
7038 * Forward all other exceptions that are valid in real mode.
7039 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
7040 * the required debugging infrastructure rework.
7041 */
7042 kvm_queue_exception(vcpu, vec);
7043 return 1;
7044}
7045
7046/*
7047 * Trigger machine check on the host. We assume all the MSRs are already set up
7048 * by the CPU and that we still run on the same CPU as the MCE occurred on.
 7049 * We pass a fake environment to the machine check handler because we want
 7050 * the guest to always be treated like user space, no matter what context
 7051 * it used internally.
7052 */
7053static void kvm_machine_check(void)
7054{
7055#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
7056 struct pt_regs regs = {
7057 .cs = 3, /* Fake ring 3 no matter what the guest ran on */
7058 .flags = X86_EFLAGS_IF,
7059 };
7060
7061 do_machine_check(&regs, 0);
7062#endif
7063}
7064
7065static int handle_machine_check(struct kvm_vcpu *vcpu)
7066{
7067 /* already handled by vcpu_run */
7068 return 1;
7069}
7070
7071static int handle_exception(struct kvm_vcpu *vcpu)
7072{
7073 struct vcpu_vmx *vmx = to_vmx(vcpu);
7074 struct kvm_run *kvm_run = vcpu->run;
7075 u32 intr_info, ex_no, error_code;
7076 unsigned long cr2, rip, dr6;
7077 u32 vect_info;
7078 enum emulation_result er;
7079
7080 vect_info = vmx->idt_vectoring_info;
7081 intr_info = vmx->exit_intr_info;
7082
7083 if (is_machine_check(intr_info))
7084 return handle_machine_check(vcpu);
7085
7086 if (is_nmi(intr_info))
7087 return 1; /* already handled by vmx_vcpu_run() */
7088
7089 if (is_invalid_opcode(intr_info))
7090 return handle_ud(vcpu);
7091
7092 error_code = 0;
7093 if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
7094 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
7095
7096 if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
7097 WARN_ON_ONCE(!enable_vmware_backdoor);
7098 er = kvm_emulate_instruction(vcpu,
7099 EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
7100 if (er == EMULATE_USER_EXIT)
7101 return 0;
7102 else if (er != EMULATE_DONE)
7103 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
7104 return 1;
7105 }
7106
7107 /*
 7108 * A #PF with PFEC.RSVD = 1 indicates the guest is accessing
 7109 * MMIO; it is better to report an internal error.
7110 * See the comments in vmx_handle_exit.
7111 */
7112 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
7113 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
7114 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
7115 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
7116 vcpu->run->internal.ndata = 3;
7117 vcpu->run->internal.data[0] = vect_info;
7118 vcpu->run->internal.data[1] = intr_info;
7119 vcpu->run->internal.data[2] = error_code;
7120 return 0;
7121 }
7122
7123 if (is_page_fault(intr_info)) {
7124 cr2 = vmcs_readl(EXIT_QUALIFICATION);
7125 /* EPT won't cause page fault directly */
7126 WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
7127 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
7128 }
7129
7130 ex_no = intr_info & INTR_INFO_VECTOR_MASK;
7131
7132 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
7133 return handle_rmode_exception(vcpu, ex_no, error_code);
7134
7135 switch (ex_no) {
7136 case AC_VECTOR:
7137 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
7138 return 1;
7139 case DB_VECTOR:
7140 dr6 = vmcs_readl(EXIT_QUALIFICATION);
7141 if (!(vcpu->guest_debug &
7142 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
7143 vcpu->arch.dr6 &= ~15;
7144 vcpu->arch.dr6 |= dr6 | DR6_RTM;
7145 if (is_icebp(intr_info))
7146 skip_emulated_instruction(vcpu);
7147
7148 kvm_queue_exception(vcpu, DB_VECTOR);
7149 return 1;
7150 }
7151 kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
7152 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
7153 /* fall through */
7154 case BP_VECTOR:
7155 /*
 7156 * Update instruction length as we may reinject #BP from
 7157 * user space while in guest debugging mode. Reading it for
 7158 * #DB as well causes no harm; it is not used in that case.
7159 */
7160 vmx->vcpu.arch.event_exit_inst_len =
7161 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
7162 kvm_run->exit_reason = KVM_EXIT_DEBUG;
7163 rip = kvm_rip_read(vcpu);
7164 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
7165 kvm_run->debug.arch.exception = ex_no;
7166 break;
7167 default:
7168 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
7169 kvm_run->ex.exception = ex_no;
7170 kvm_run->ex.error_code = error_code;
7171 break;
7172 }
7173 return 0;
7174}
7175
7176static int handle_external_interrupt(struct kvm_vcpu *vcpu)
7177{
7178 ++vcpu->stat.irq_exits;
7179 return 1;
7180}
7181
7182static int handle_triple_fault(struct kvm_vcpu *vcpu)
7183{
7184 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
7185 vcpu->mmio_needed = 0;
7186 return 0;
7187}
7188
7189static int handle_io(struct kvm_vcpu *vcpu)
7190{
7191 unsigned long exit_qualification;
7192 int size, in, string;
7193 unsigned port;
7194
7195 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7196 string = (exit_qualification & 16) != 0;
7197
7198 ++vcpu->stat.io_exits;
7199
7200 if (string)
7201 return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
7202
7203 port = exit_qualification >> 16;
7204 size = (exit_qualification & 7) + 1;
7205 in = (exit_qualification & 8) != 0;
7206
7207 return kvm_fast_pio(vcpu, size, port, in);
7208}
7209
7210static void
7211vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
7212{
7213 /*
7214 * Patch in the VMCALL instruction:
7215 */
7216 hypercall[0] = 0x0f;
7217 hypercall[1] = 0x01;
7218 hypercall[2] = 0xc1;
7219}
7220
7221/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
7222static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
7223{
7224 if (is_guest_mode(vcpu)) {
7225 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7226 unsigned long orig_val = val;
7227
7228 /*
7229 * We get here when L2 changed cr0 in a way that did not change
7230 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
7231 * but did change L0 shadowed bits. So we first calculate the
7232 * effective cr0 value that L1 would like to write into the
7233 * hardware. It consists of the L2-owned bits from the new
7234 * value combined with the L1-owned bits from L1's guest_cr0.
7235 */
7236 val = (val & ~vmcs12->cr0_guest_host_mask) |
7237 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
7238
7239 if (!nested_guest_cr0_valid(vcpu, val))
7240 return 1;
7241
7242 if (kvm_set_cr0(vcpu, val))
7243 return 1;
7244 vmcs_writel(CR0_READ_SHADOW, orig_val);
7245 return 0;
7246 } else {
7247 if (to_vmx(vcpu)->nested.vmxon &&
7248 !nested_host_cr0_valid(vcpu, val))
7249 return 1;
7250
7251 return kvm_set_cr0(vcpu, val);
7252 }
7253}
7254
7255static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
7256{
7257 if (is_guest_mode(vcpu)) {
7258 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7259 unsigned long orig_val = val;
7260
7261 /* analogously to handle_set_cr0 */
7262 val = (val & ~vmcs12->cr4_guest_host_mask) |
7263 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
7264 if (kvm_set_cr4(vcpu, val))
7265 return 1;
7266 vmcs_writel(CR4_READ_SHADOW, orig_val);
7267 return 0;
7268 } else
7269 return kvm_set_cr4(vcpu, val);
7270}
7271
7272static int handle_desc(struct kvm_vcpu *vcpu)
7273{
7274 WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
7275 return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
7276}
7277
7278static int handle_cr(struct kvm_vcpu *vcpu)
7279{
7280 unsigned long exit_qualification, val;
7281 int cr;
7282 int reg;
7283 int err;
7284 int ret;
7285
7286 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7287 cr = exit_qualification & 15;
7288 reg = (exit_qualification >> 8) & 15;
7289 switch ((exit_qualification >> 4) & 3) {
7290 case 0: /* mov to cr */
7291 val = kvm_register_readl(vcpu, reg);
7292 trace_kvm_cr_write(cr, val);
7293 switch (cr) {
7294 case 0:
7295 err = handle_set_cr0(vcpu, val);
7296 return kvm_complete_insn_gp(vcpu, err);
7297 case 3:
7298 WARN_ON_ONCE(enable_unrestricted_guest);
7299 err = kvm_set_cr3(vcpu, val);
7300 return kvm_complete_insn_gp(vcpu, err);
7301 case 4:
7302 err = handle_set_cr4(vcpu, val);
7303 return kvm_complete_insn_gp(vcpu, err);
7304 case 8: {
7305 u8 cr8_prev = kvm_get_cr8(vcpu);
7306 u8 cr8 = (u8)val;
7307 err = kvm_set_cr8(vcpu, cr8);
7308 ret = kvm_complete_insn_gp(vcpu, err);
7309 if (lapic_in_kernel(vcpu))
7310 return ret;
7311 if (cr8_prev <= cr8)
7312 return ret;
7313 /*
7314 * TODO: we might be squashing a
7315 * KVM_GUESTDBG_SINGLESTEP-triggered
7316 * KVM_EXIT_DEBUG here.
7317 */
7318 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
7319 return 0;
7320 }
7321 }
7322 break;
7323 case 2: /* clts */
7324 WARN_ONCE(1, "Guest should always own CR0.TS");
7325 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
7326 trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
7327 return kvm_skip_emulated_instruction(vcpu);
 7328 case 1: /* mov from cr */
7329 switch (cr) {
7330 case 3:
7331 WARN_ON_ONCE(enable_unrestricted_guest);
7332 val = kvm_read_cr3(vcpu);
7333 kvm_register_write(vcpu, reg, val);
7334 trace_kvm_cr_read(cr, val);
7335 return kvm_skip_emulated_instruction(vcpu);
7336 case 8:
7337 val = kvm_get_cr8(vcpu);
7338 kvm_register_write(vcpu, reg, val);
7339 trace_kvm_cr_read(cr, val);
7340 return kvm_skip_emulated_instruction(vcpu);
7341 }
7342 break;
7343 case 3: /* lmsw */
7344 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
7345 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
7346 kvm_lmsw(vcpu, val);
7347
7348 return kvm_skip_emulated_instruction(vcpu);
7349 default:
7350 break;
7351 }
7352 vcpu->run->exit_reason = 0;
7353 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
7354 (int)(exit_qualification >> 4) & 3, cr);
7355 return 0;
7356}
7357
7358static int handle_dr(struct kvm_vcpu *vcpu)
7359{
7360 unsigned long exit_qualification;
7361 int dr, dr7, reg;
7362
7363 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7364 dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
7365
7366 /* First, if DR does not exist, trigger UD */
7367 if (!kvm_require_dr(vcpu, dr))
7368 return 1;
7369
 7370 /* Do not handle if the CPL > 0; a #GP will be triggered on re-entry */
7371 if (!kvm_require_cpl(vcpu, 0))
7372 return 1;
7373 dr7 = vmcs_readl(GUEST_DR7);
7374 if (dr7 & DR7_GD) {
7375 /*
7376 * As the vm-exit takes precedence over the debug trap, we
7377 * need to emulate the latter, either for the host or the
7378 * guest debugging itself.
7379 */
7380 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
7381 vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
7382 vcpu->run->debug.arch.dr7 = dr7;
7383 vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
7384 vcpu->run->debug.arch.exception = DB_VECTOR;
7385 vcpu->run->exit_reason = KVM_EXIT_DEBUG;
7386 return 0;
7387 } else {
7388 vcpu->arch.dr6 &= ~15;
7389 vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
7390 kvm_queue_exception(vcpu, DB_VECTOR);
7391 return 1;
7392 }
7393 }
7394
7395 if (vcpu->guest_debug == 0) {
7396 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
7397 CPU_BASED_MOV_DR_EXITING);
7398
7399 /*
7400 * No more DR vmexits; force a reload of the debug registers
7401 * and reenter on this instruction. The next vmexit will
7402 * retrieve the full state of the debug registers.
7403 */
7404 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
7405 return 1;
7406 }
7407
7408 reg = DEBUG_REG_ACCESS_REG(exit_qualification);
7409 if (exit_qualification & TYPE_MOV_FROM_DR) {
7410 unsigned long val;
7411
7412 if (kvm_get_dr(vcpu, dr, &val))
7413 return 1;
7414 kvm_register_write(vcpu, reg, val);
7415 } else
7416 if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
7417 return 1;
7418
7419 return kvm_skip_emulated_instruction(vcpu);
7420}
7421
7422static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
7423{
7424 return vcpu->arch.dr6;
7425}
7426
7427static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
7428{
7429}
7430
7431static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
7432{
7433 get_debugreg(vcpu->arch.db[0], 0);
7434 get_debugreg(vcpu->arch.db[1], 1);
7435 get_debugreg(vcpu->arch.db[2], 2);
7436 get_debugreg(vcpu->arch.db[3], 3);
7437 get_debugreg(vcpu->arch.dr6, 6);
7438 vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
7439
7440 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
7441 vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING);
7442}
7443
7444static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
7445{
7446 vmcs_writel(GUEST_DR7, val);
7447}
7448
7449static int handle_cpuid(struct kvm_vcpu *vcpu)
7450{
7451 return kvm_emulate_cpuid(vcpu);
7452}
7453
7454static int handle_rdmsr(struct kvm_vcpu *vcpu)
7455{
7456 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
7457 struct msr_data msr_info;
7458
7459 msr_info.index = ecx;
7460 msr_info.host_initiated = false;
7461 if (vmx_get_msr(vcpu, &msr_info)) {
7462 trace_kvm_msr_read_ex(ecx);
7463 kvm_inject_gp(vcpu, 0);
7464 return 1;
7465 }
7466
7467 trace_kvm_msr_read(ecx, msr_info.data);
7468
7469 /* FIXME: handling of bits 32:63 of rax, rdx */
7470 vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u;
7471 vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u;
7472 return kvm_skip_emulated_instruction(vcpu);
7473}
7474
7475static int handle_wrmsr(struct kvm_vcpu *vcpu)
7476{
7477 struct msr_data msr;
7478 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
7479 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
7480 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
7481
7482 msr.data = data;
7483 msr.index = ecx;
7484 msr.host_initiated = false;
7485 if (kvm_set_msr(vcpu, &msr) != 0) {
7486 trace_kvm_msr_write_ex(ecx, data);
7487 kvm_inject_gp(vcpu, 0);
7488 return 1;
7489 }
7490
7491 trace_kvm_msr_write(ecx, data);
7492 return kvm_skip_emulated_instruction(vcpu);
7493}
7494
7495static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
7496{
7497 kvm_apic_update_ppr(vcpu);
7498 return 1;
7499}
7500
7501static int handle_interrupt_window(struct kvm_vcpu *vcpu)
7502{
7503 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
7504 CPU_BASED_VIRTUAL_INTR_PENDING);
7505
7506 kvm_make_request(KVM_REQ_EVENT, vcpu);
7507
7508 ++vcpu->stat.irq_window_exits;
7509 return 1;
7510}
7511
7512static int handle_halt(struct kvm_vcpu *vcpu)
7513{
7514 return kvm_emulate_halt(vcpu);
7515}
7516
7517static int handle_vmcall(struct kvm_vcpu *vcpu)
7518{
7519 return kvm_emulate_hypercall(vcpu);
7520}
7521
7522static int handle_invd(struct kvm_vcpu *vcpu)
7523{
7524 return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
7525}
7526
7527static int handle_invlpg(struct kvm_vcpu *vcpu)
7528{
7529 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7530
7531 kvm_mmu_invlpg(vcpu, exit_qualification);
7532 return kvm_skip_emulated_instruction(vcpu);
7533}
7534
7535static int handle_rdpmc(struct kvm_vcpu *vcpu)
7536{
7537 int err;
7538
7539 err = kvm_rdpmc(vcpu);
7540 return kvm_complete_insn_gp(vcpu, err);
7541}
7542
7543static int handle_wbinvd(struct kvm_vcpu *vcpu)
7544{
7545 return kvm_emulate_wbinvd(vcpu);
7546}
7547
7548static int handle_xsetbv(struct kvm_vcpu *vcpu)
7549{
7550 u64 new_bv = kvm_read_edx_eax(vcpu);
7551 u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
7552
7553 if (kvm_set_xcr(vcpu, index, new_bv) == 0)
7554 return kvm_skip_emulated_instruction(vcpu);
7555 return 1;
7556}
7557
7558static int handle_xsaves(struct kvm_vcpu *vcpu)
7559{
7560 kvm_skip_emulated_instruction(vcpu);
7561 WARN(1, "this should never happen\n");
7562 return 1;
7563}
7564
7565static int handle_xrstors(struct kvm_vcpu *vcpu)
7566{
7567 kvm_skip_emulated_instruction(vcpu);
7568 WARN(1, "this should never happen\n");
7569 return 1;
7570}
7571
7572static int handle_apic_access(struct kvm_vcpu *vcpu)
7573{
7574 if (likely(fasteoi)) {
7575 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7576 int access_type, offset;
7577
7578 access_type = exit_qualification & APIC_ACCESS_TYPE;
7579 offset = exit_qualification & APIC_ACCESS_OFFSET;
7580 /*
 7581 * A sane guest uses MOV to write the EOI register, and the
 7582 * written value is ignored, so short-circuit here to avoid
 7583 * heavy instruction emulation.
7584 */
7585 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
7586 (offset == APIC_EOI)) {
7587 kvm_lapic_set_eoi(vcpu);
7588 return kvm_skip_emulated_instruction(vcpu);
7589 }
7590 }
7591 return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
7592}
7593
7594static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
7595{
7596 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7597 int vector = exit_qualification & 0xff;
7598
7599 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
7600 kvm_apic_set_eoi_accelerated(vcpu, vector);
7601 return 1;
7602}
7603
7604static int handle_apic_write(struct kvm_vcpu *vcpu)
7605{
7606 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7607 u32 offset = exit_qualification & 0xfff;
7608
7609 /* APIC-write VM exit is trap-like and thus no need to adjust IP */
7610 kvm_apic_write_nodecode(vcpu, offset);
7611 return 1;
7612}
7613
7614static int handle_task_switch(struct kvm_vcpu *vcpu)
7615{
7616 struct vcpu_vmx *vmx = to_vmx(vcpu);
7617 unsigned long exit_qualification;
7618 bool has_error_code = false;
7619 u32 error_code = 0;
7620 u16 tss_selector;
7621 int reason, type, idt_v, idt_index;
7622
7623 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
7624 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
7625 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
7626
7627 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7628
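 /*
 * Task-switch exit qualification: bits 15:0 hold the new TSS
 * selector and bits 31:30 encode the source of the task switch
 * (CALL, IRET, JMP or task gate in the IDT).
 */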
7629 reason = (u32)exit_qualification >> 30;
7630 if (reason == TASK_SWITCH_GATE && idt_v) {
7631 switch (type) {
7632 case INTR_TYPE_NMI_INTR:
7633 vcpu->arch.nmi_injected = false;
7634 vmx_set_nmi_mask(vcpu, true);
7635 break;
7636 case INTR_TYPE_EXT_INTR:
7637 case INTR_TYPE_SOFT_INTR:
7638 kvm_clear_interrupt_queue(vcpu);
7639 break;
7640 case INTR_TYPE_HARD_EXCEPTION:
7641 if (vmx->idt_vectoring_info &
7642 VECTORING_INFO_DELIVER_CODE_MASK) {
7643 has_error_code = true;
7644 error_code =
7645 vmcs_read32(IDT_VECTORING_ERROR_CODE);
7646 }
7647 /* fall through */
7648 case INTR_TYPE_SOFT_EXCEPTION:
7649 kvm_clear_exception_queue(vcpu);
7650 break;
7651 default:
7652 break;
7653 }
7654 }
7655 tss_selector = exit_qualification;
7656
7657 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
7658 type != INTR_TYPE_EXT_INTR &&
7659 type != INTR_TYPE_NMI_INTR))
7660 skip_emulated_instruction(vcpu);
7661
7662 if (kvm_task_switch(vcpu, tss_selector,
7663 type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
7664 has_error_code, error_code) == EMULATE_FAIL) {
7665 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
7666 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
7667 vcpu->run->internal.ndata = 0;
7668 return 0;
7669 }
7670
7671 /*
7672 * TODO: What about debug traps on tss switch?
7673 * Are we supposed to inject them and update dr6?
7674 */
7675
7676 return 1;
7677}
7678
7679static int handle_ept_violation(struct kvm_vcpu *vcpu)
7680{
7681 unsigned long exit_qualification;
7682 gpa_t gpa;
7683 u64 error_code;
7684
7685 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7686
7687 /*
 7688 * If the EPT violation happened while executing IRET from an NMI,
 7689 * the "blocked by NMI" bit has to be set before the next VM entry.
 7690 * There are errata that may cause this bit to not be set:
 7691 * AAK134, BY25.
7692 */
7693 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
7694 enable_vnmi &&
7695 (exit_qualification & INTR_INFO_UNBLOCK_NMI))
7696 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
7697
7698 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
7699 trace_kvm_page_fault(gpa, exit_qualification);
7700
7701 /* Is it a read fault? */
7702 error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
7703 ? PFERR_USER_MASK : 0;
7704 /* Is it a write fault? */
7705 error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
7706 ? PFERR_WRITE_MASK : 0;
7707 /* Is it a fetch fault? */
7708 error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
7709 ? PFERR_FETCH_MASK : 0;
7710 /* ept page table entry is present? */
7711 error_code |= (exit_qualification &
7712 (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
7713 EPT_VIOLATION_EXECUTABLE))
7714 ? PFERR_PRESENT_MASK : 0;
7715
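	/*
	 * Exit-qualification bit 8 distinguishes a fault on the final
	 * guest-physical access (bit set) from a fault encountered while
	 * walking the guest's own page tables.
	 */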
7716 error_code |= (exit_qualification & 0x100) != 0 ?
7717 PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
7718
7719 vcpu->arch.exit_qualification = exit_qualification;
7720 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
7721}
7722
7723static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
7724{
7725 gpa_t gpa;
7726
7727 /*
7728 * A nested guest cannot optimize MMIO vmexits, because we have an
7729 * nGPA here instead of the required GPA.
7730 */
7731 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
7732 if (!is_guest_mode(vcpu) &&
7733 !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
7734 trace_kvm_fast_mmio(gpa);
7735 /*
7736		 * Doing kvm_skip_emulated_instruction() depends on undefined
7737		 * behavior: Intel's manual doesn't mandate that
7738		 * VM_EXIT_INSTRUCTION_LEN be set in the VMCS when an EPT
7739		 * misconfig occurs. While real hardware was observed to set it,
7740		 * other hypervisors (namely Hyper-V) don't, so we could end up
7741		 * advancing the IP by some random value. Disable fast MMIO when
7742		 * running nested and keep it for real hardware, in the hope that
7743		 * VM_EXIT_INSTRUCTION_LEN will always be set correctly there.
7744 */
7745 if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
7746 return kvm_skip_emulated_instruction(vcpu);
7747 else
7748 return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) ==
7749 EMULATE_DONE;
7750 }
7751
7752 return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
7753}
7754
7755static int handle_nmi_window(struct kvm_vcpu *vcpu)
7756{
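	/*
	 * An NMI-window exit means NMIs are no longer blocked; drop the
	 * window request and let the regular event-injection path
	 * (triggered via KVM_REQ_EVENT) deliver the pending NMI.
	 */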
7757 WARN_ON_ONCE(!enable_vnmi);
7758 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
7759 CPU_BASED_VIRTUAL_NMI_PENDING);
7760 ++vcpu->stat.nmi_window_exits;
7761 kvm_make_request(KVM_REQ_EVENT, vcpu);
7762
7763 return 1;
7764}
7765
7766static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
7767{
7768 struct vcpu_vmx *vmx = to_vmx(vcpu);
7769 enum emulation_result err = EMULATE_DONE;
7770 int ret = 1;
7771 u32 cpu_exec_ctrl;
7772 bool intr_window_requested;
7773 unsigned count = 130;
7774
7775 /*
7776 * We should never reach the point where we are emulating L2
7777 * due to invalid guest state as that means we incorrectly
7778 * allowed a nested VMEntry with an invalid vmcs12.
7779 */
7780 WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending);
7781
7782 cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
7783 intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
7784
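	/*
	 * The emulation loop below is bounded by 'count' so that, even if
	 * emulation keeps being required, we periodically fall back to the
	 * main vcpu_run loop; interrupt windows, pending events, signals
	 * and rescheduling are also checked on every iteration.
	 */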
7785 while (vmx->emulation_required && count-- != 0) {
7786 if (intr_window_requested && vmx_interrupt_allowed(vcpu))
7787 return handle_interrupt_window(&vmx->vcpu);
7788
7789 if (kvm_test_request(KVM_REQ_EVENT, vcpu))
7790 return 1;
7791
7792 err = kvm_emulate_instruction(vcpu, 0);
7793
7794 if (err == EMULATE_USER_EXIT) {
7795 ++vcpu->stat.mmio_exits;
7796 ret = 0;
7797 goto out;
7798 }
7799
7800 if (err != EMULATE_DONE)
7801 goto emulation_error;
7802
7803 if (vmx->emulation_required && !vmx->rmode.vm86_active &&
7804 vcpu->arch.exception.pending)
7805 goto emulation_error;
7806
7807 if (vcpu->arch.halt_request) {
7808 vcpu->arch.halt_request = 0;
7809 ret = kvm_vcpu_halt(vcpu);
7810 goto out;
7811 }
7812
7813 if (signal_pending(current))
7814 goto out;
7815 if (need_resched())
7816 schedule();
7817 }
7818
7819out:
7820 return ret;
7821
7822emulation_error:
7823 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
7824 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
7825 vcpu->run->internal.ndata = 0;
7826 return 0;
7827}
7828
7829static void grow_ple_window(struct kvm_vcpu *vcpu)
7830{
7831 struct vcpu_vmx *vmx = to_vmx(vcpu);
7832 int old = vmx->ple_window;
7833
7834 vmx->ple_window = __grow_ple_window(old, ple_window,
7835 ple_window_grow,
7836 ple_window_max);
7837
7838 if (vmx->ple_window != old)
7839 vmx->ple_window_dirty = true;
7840
7841 trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old);
7842}
7843
7844static void shrink_ple_window(struct kvm_vcpu *vcpu)
7845{
7846 struct vcpu_vmx *vmx = to_vmx(vcpu);
7847 int old = vmx->ple_window;
7848
7849 vmx->ple_window = __shrink_ple_window(old, ple_window,
7850 ple_window_shrink,
7851 ple_window);
7852
7853 if (vmx->ple_window != old)
7854 vmx->ple_window_dirty = true;
7855
7856 trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old);
7857}
7858
7859/*
7860 * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
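 * Runs from the IPI handler for the wakeup vector: walk this CPU's list
 * of blocked vCPUs and kick every vCPU whose posted-interrupt descriptor
 * has the ON bit set, so it wakes up and handles the posted interrupt.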
7861 */
7862static void wakeup_handler(void)
7863{
7864 struct kvm_vcpu *vcpu;
7865 int cpu = smp_processor_id();
7866
7867 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
7868 list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
7869 blocked_vcpu_list) {
7870 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
7871
7872 if (pi_test_on(pi_desc) == 1)
7873 kvm_vcpu_kick(vcpu);
7874 }
7875 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
7876}
7877
7878static void vmx_enable_tdp(void)
7879{
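	/*
	 * Tell the generic MMU code which EPT PTE bits encode the
	 * accessed, dirty, executable and present attributes; the A/D
	 * bits are passed only when EPT A/D support is enabled.
	 */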
7880 kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
7881 enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
7882 enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
7883 0ull, VMX_EPT_EXECUTABLE_MASK,
7884 cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
7885 VMX_EPT_RWX_MASK, 0ull);
7886
7887 ept_set_mmio_spte_mask();
7888 kvm_enable_tdp();
7889}
7890
7891static __init int hardware_setup(void)
7892{
7893 unsigned long host_bndcfgs;
7894 int r = -ENOMEM, i;
7895
7896 rdmsrl_safe(MSR_EFER, &host_efer);
7897
7898 for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
7899 kvm_define_shared_msr(i, vmx_msr_index[i]);
7900
7901 for (i = 0; i < VMX_BITMAP_NR; i++) {
7902 vmx_bitmap[i] = (unsigned long *)__get_free_page(GFP_KERNEL);
7903 if (!vmx_bitmap[i])
7904 goto out;
7905 }
7906
7907 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
7908 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
7909
7910 if (setup_vmcs_config(&vmcs_config) < 0) {
7911 r = -EIO;
7912 goto out;
7913 }
7914
7915 if (boot_cpu_has(X86_FEATURE_NX))
7916 kvm_enable_efer_bits(EFER_NX);
7917
7918 if (boot_cpu_has(X86_FEATURE_MPX)) {
7919 rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
7920 WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
7921 }
7922
7923 if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
7924 !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
7925 enable_vpid = 0;
7926
7927 if (!cpu_has_vmx_ept() ||
7928 !cpu_has_vmx_ept_4levels() ||
7929 !cpu_has_vmx_ept_mt_wb() ||
7930 !cpu_has_vmx_invept_global())
7931 enable_ept = 0;
7932
7933 if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
7934 enable_ept_ad_bits = 0;
7935
7936 if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
7937 enable_unrestricted_guest = 0;
7938
7939 if (!cpu_has_vmx_flexpriority())
7940 flexpriority_enabled = 0;
7941
7942 if (!cpu_has_virtual_nmis())
7943 enable_vnmi = 0;
7944
7945 /*
7946 * set_apic_access_page_addr() is used to reload apic access
7947 * page upon invalidation. No need to do anything if not
7948 * using the APIC_ACCESS_ADDR VMCS field.
7949 */
7950 if (!flexpriority_enabled)
7951 kvm_x86_ops->set_apic_access_page_addr = NULL;
7952
7953 if (!cpu_has_vmx_tpr_shadow())
7954 kvm_x86_ops->update_cr8_intercept = NULL;
7955
7956 if (enable_ept && !cpu_has_vmx_ept_2m_page())
7957 kvm_disable_largepages();
7958
7959#if IS_ENABLED(CONFIG_HYPERV)
7960 if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
7961 && enable_ept)
7962 kvm_x86_ops->tlb_remote_flush = vmx_hv_remote_flush_tlb;
7963#endif
7964
7965 if (!cpu_has_vmx_ple()) {
7966 ple_gap = 0;
7967 ple_window = 0;
7968 ple_window_grow = 0;
7969 ple_window_max = 0;
7970 ple_window_shrink = 0;
7971 }
7972
7973 if (!cpu_has_vmx_apicv()) {
7974 enable_apicv = 0;
7975 kvm_x86_ops->sync_pir_to_irr = NULL;
7976 }
7977
7978 if (cpu_has_vmx_tsc_scaling()) {
7979 kvm_has_tsc_control = true;
7980 kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
7981 kvm_tsc_scaling_ratio_frac_bits = 48;
7982 }
7983
7984 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
7985
7986 if (enable_ept)
7987 vmx_enable_tdp();
7988 else
7989 kvm_disable_tdp();
7990
7991 if (!nested) {
7992 kvm_x86_ops->get_nested_state = NULL;
7993 kvm_x86_ops->set_nested_state = NULL;
7994 }
7995
7996 /*
7997	 * Only enable PML when the hardware supports the PML feature and both
7998	 * EPT and EPT A/D bits are enabled -- PML depends on them to work.
7999 */
8000 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
8001 enable_pml = 0;
8002
8003 if (!enable_pml) {
8004 kvm_x86_ops->slot_enable_log_dirty = NULL;
8005 kvm_x86_ops->slot_disable_log_dirty = NULL;
8006 kvm_x86_ops->flush_log_dirty = NULL;
8007 kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
8008 }
8009
8010 if (!cpu_has_vmx_preemption_timer())
8011 kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
8012
8013 if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
8014 u64 vmx_msr;
8015
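		/*
		 * The low bits of IA32_VMX_MISC give the rate of the VMX
		 * preemption timer relative to the TSC: the timer ticks
		 * once every 2^cpu_preemption_timer_multi TSC cycles.
		 */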
8016 rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
8017 cpu_preemption_timer_multi =
8018 vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
8019 } else {
8020 kvm_x86_ops->set_hv_timer = NULL;
8021 kvm_x86_ops->cancel_hv_timer = NULL;
8022 }
8023
8024 if (!cpu_has_vmx_shadow_vmcs())
8025 enable_shadow_vmcs = 0;
8026 if (enable_shadow_vmcs)
8027 init_vmcs_shadow_fields();
8028
8029 kvm_set_posted_intr_wakeup_handler(wakeup_handler);
8030 nested_vmx_setup_ctls_msrs(&vmcs_config.nested, enable_apicv);
8031
8032 kvm_mce_cap_supported |= MCG_LMCE_P;
8033
8034 return alloc_kvm_area();
8035
8036out:
8037 for (i = 0; i < VMX_BITMAP_NR; i++)
8038 free_page((unsigned long)vmx_bitmap[i]);
8039
8040 return r;
8041}
8042
8043static __exit void hardware_unsetup(void)
8044{
8045 int i;
8046
8047 for (i = 0; i < VMX_BITMAP_NR; i++)
8048 free_page((unsigned long)vmx_bitmap[i]);
8049
8050 free_kvm_area();
8051}
8052
8053/*
8054 * Indicate a vcpu busy-waiting on a spinlock. We do not enable PAUSE
8055 * exiting, so we only get here on a CPU with PAUSE-loop exiting.
8056 */
8057static int handle_pause(struct kvm_vcpu *vcpu)
8058{
8059 if (!kvm_pause_in_guest(vcpu->kvm))
8060 grow_ple_window(vcpu);
8061
8062 /*
8063	 * Intel SDM Vol. 3, Section 25.1.3 says: the "PAUSE-loop exiting"
8064	 * VM-execution control is ignored if CPL > 0. OTOH, KVM never
8065	 * sets PAUSE_EXITING and only sets PLE when it is supported, so
8066	 * the vcpu must be at CPL 0 if it gets a PAUSE exit.
8067 */
8068 kvm_vcpu_on_spin(vcpu, true);
8069 return kvm_skip_emulated_instruction(vcpu);
8070}
8071
8072static int handle_nop(struct kvm_vcpu *vcpu)
8073{
8074 return kvm_skip_emulated_instruction(vcpu);
8075}
8076
8077static int handle_mwait(struct kvm_vcpu *vcpu)
8078{
8079 printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
8080 return handle_nop(vcpu);
8081}
8082
8083static int handle_invalid_op(struct kvm_vcpu *vcpu)
8084{
8085 kvm_queue_exception(vcpu, UD_VECTOR);
8086 return 1;
8087}
8088
8089static int handle_monitor_trap(struct kvm_vcpu *vcpu)
8090{
8091 return 1;
8092}
8093
8094static int handle_monitor(struct kvm_vcpu *vcpu)
8095{
8096 printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
8097 return handle_nop(vcpu);
8098}
8099
8100/*
8101 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
8102 * set the success or error code of an emulated VMX instruction (as specified
8103 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
8104 * instruction.
8105 */
8106static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
8107{
8108 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
8109 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
8110 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
8111 return kvm_skip_emulated_instruction(vcpu);
8112}
8113
8114static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
8115{
8116 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
8117 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
8118 X86_EFLAGS_SF | X86_EFLAGS_OF))
8119 | X86_EFLAGS_CF);
8120 return kvm_skip_emulated_instruction(vcpu);
8121}
8122
8123static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
8124 u32 vm_instruction_error)
8125{
8126 struct vcpu_vmx *vmx = to_vmx(vcpu);
8127
8128 /*
8129 * failValid writes the error number to the current VMCS, which
8130 * can't be done if there isn't a current VMCS.
8131 */
8132 if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
8133 return nested_vmx_failInvalid(vcpu);
8134
8135 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
8136 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
8137 X86_EFLAGS_SF | X86_EFLAGS_OF))
8138 | X86_EFLAGS_ZF);
8139 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
8140 /*
8141 * We don't need to force a shadow sync because
8142 * VM_INSTRUCTION_ERROR is not shadowed
8143 */
8144 return kvm_skip_emulated_instruction(vcpu);
8145}
8146
8147static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
8148{
8149	/* TODO: don't simply reset the guest here. */
8150 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
8151 pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
8152}
8153
8154static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
8155{
8156 struct vcpu_vmx *vmx =
8157 container_of(timer, struct vcpu_vmx, nested.preemption_timer);
8158
8159 vmx->nested.preemption_timer_expired = true;
8160 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
8161 kvm_vcpu_kick(&vmx->vcpu);
8162
8163 return HRTIMER_NORESTART;
8164}
8165
8166/*
8167 * Decode the memory-address operand of a vmx instruction, as recorded on an
8168 * exit caused by such an instruction (run by a guest hypervisor).
8169 * On success, returns 0. When the operand is invalid, returns 1 and
8170 * injects #UD or #GP into the guest.
8171 */
8172static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
8173 unsigned long exit_qualification,
8174 u32 vmx_instruction_info, bool wr, gva_t *ret)
8175{
8176 gva_t off;
8177 bool exn;
8178 struct kvm_segment s;
8179
8180 /*
8181 * According to Vol. 3B, "Information for VM Exits Due to Instruction
8182 * Execution", on an exit, vmx_instruction_info holds most of the
8183 * addressing components of the operand. Only the displacement part
8184 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
8185 * For how an actual address is calculated from all these components,
8186 * refer to Vol. 1, "Operand Addressing".
8187 */
8188 int scaling = vmx_instruction_info & 3;
8189 int addr_size = (vmx_instruction_info >> 7) & 7;
8190 bool is_reg = vmx_instruction_info & (1u << 10);
8191 int seg_reg = (vmx_instruction_info >> 15) & 7;
8192 int index_reg = (vmx_instruction_info >> 18) & 0xf;
8193 bool index_is_valid = !(vmx_instruction_info & (1u << 22));
8194 int base_reg = (vmx_instruction_info >> 23) & 0xf;
8195 bool base_is_valid = !(vmx_instruction_info & (1u << 27));
8196
8197 if (is_reg) {
8198 kvm_queue_exception(vcpu, UD_VECTOR);
8199 return 1;
8200 }
8201
8202 /* Addr = segment_base + offset */
8203 /* offset = base + [index * scale] + displacement */
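	/*
	 * e.g. for a hypothetical "vmptrld 0x10(%rax,%rbx,4)", the
	 * displacement 0x10 arrives in the exit qualification, base_reg
	 * selects RAX, index_reg selects RBX and scaling is 2.
	 */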
8204 off = exit_qualification; /* holds the displacement */
8205 if (base_is_valid)
8206 off += kvm_register_read(vcpu, base_reg);
8207 if (index_is_valid)
8208		off += kvm_register_read(vcpu, index_reg) << scaling;
8209 vmx_get_segment(vcpu, &s, seg_reg);
8210 *ret = s.base + off;
8211
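	/*
	 * Per the SDM, addr_size encodes 0 = 16-bit, 1 = 32-bit and
	 * 2 = 64-bit; only the 32-bit case is explicitly truncated here.
	 */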
8212 if (addr_size == 1) /* 32 bit */
8213 *ret &= 0xffffffff;
8214
8215 /* Checks for #GP/#SS exceptions. */
8216 exn = false;
8217 if (is_long_mode(vcpu)) {
8218 /* Long mode: #GP(0)/#SS(0) if the memory address is in a
8219 * non-canonical form. This is the only check on the memory
8220 * destination for long mode!
8221 */
8222 exn = is_noncanonical_address(*ret, vcpu);
8223 } else if (is_protmode(vcpu)) {
8224 /* Protected mode: apply checks for segment validity in the
8225 * following order:
8226 * - segment type check (#GP(0) may be thrown)
8227 * - usability check (#GP(0)/#SS(0))
8228 * - limit check (#GP(0)/#SS(0))
8229 */
8230 if (wr)
8231 /* #GP(0) if the destination operand is located in a
8232 * read-only data segment or any code segment.
8233 */
8234 exn = ((s.type & 0xa) == 0 || (s.type & 8));
8235 else
8236 /* #GP(0) if the source operand is located in an
8237 * execute-only code segment
8238 */
8239 exn = ((s.type & 0xa) == 8);
8240 if (exn) {
8241 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
8242 return 1;
8243 }
8244 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
8245 */
8246 exn = (s.unusable != 0);
8247 /* Protected mode: #GP(0)/#SS(0) if the memory
8248 * operand is outside the segment limit.
8249 */
8250 exn = exn || (off + sizeof(u64) > s.limit);
8251 }
8252 if (exn) {
8253 kvm_queue_exception_e(vcpu,
8254 seg_reg == VCPU_SREG_SS ?
8255 SS_VECTOR : GP_VECTOR,
8256 0);
8257 return 1;
8258 }
8259
8260 return 0;
8261}
8262
8263static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
8264{
8265 gva_t gva;
8266 struct x86_exception e;
8267
8268 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
8269 vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
8270 return 1;
8271
8272 if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
8273 kvm_inject_page_fault(vcpu, &e);
8274 return 1;
8275 }
8276
8277 return 0;
8278}
8279
8280/*
8281 * Allocate a shadow VMCS and associate it with the currently loaded
8282 * VMCS, unless such a shadow VMCS already exists. The newly allocated
8283 * VMCS is also VMCLEARed, so that it is ready for use.
8284 */
8285static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
8286{
8287 struct vcpu_vmx *vmx = to_vmx(vcpu);
8288 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
8289
8290 /*
8291 * We should allocate a shadow vmcs for vmcs01 only when L1
8292 * executes VMXON and free it when L1 executes VMXOFF.
8293 * As it is invalid to execute VMXON twice, we shouldn't reach
8294	 * here when vmcs01 already has an allocated shadow vmcs.
8295 */
8296 WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
8297
8298 if (!loaded_vmcs->shadow_vmcs) {
8299 loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
8300 if (loaded_vmcs->shadow_vmcs)
8301 vmcs_clear(loaded_vmcs->shadow_vmcs);
8302 }
8303 return loaded_vmcs->shadow_vmcs;
8304}
8305
8306static int enter_vmx_operation(struct kvm_vcpu *vcpu)
8307{
8308 struct vcpu_vmx *vmx = to_vmx(vcpu);
8309 int r;
8310
8311 r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
8312 if (r < 0)
8313 goto out_vmcs02;
8314
8315 vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
8316 if (!vmx->nested.cached_vmcs12)
8317 goto out_cached_vmcs12;
8318
8319 vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
8320 if (!vmx->nested.cached_shadow_vmcs12)
8321 goto out_cached_shadow_vmcs12;
8322
8323 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
8324 goto out_shadow_vmcs;
8325
8326 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
8327 HRTIMER_MODE_REL_PINNED);
8328 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
8329
8330 vmx->nested.vpid02 = allocate_vpid();
8331
8332 vmx->nested.vmcs02_initialized = false;
8333 vmx->nested.vmxon = true;
8334 return 0;
8335
8336out_shadow_vmcs:
8337 kfree(vmx->nested.cached_shadow_vmcs12);
8338
8339out_cached_shadow_vmcs12:
8340 kfree(vmx->nested.cached_vmcs12);
8341
8342out_cached_vmcs12:
8343 free_loaded_vmcs(&vmx->nested.vmcs02);
8344
8345out_vmcs02:
8346 return -ENOMEM;
8347}
8348
8349/*
8350 * Emulate the VMXON instruction.
8351 * Currently, we just remember that VMX is active, and do not save or even
8352 * inspect the argument to VMXON (the so-called "VMXON pointer") because we
8353 * do not currently need to store anything in that guest-allocated memory
8354 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their
8355 * argument is different from the VMXON pointer (which the spec says they do).
8356 */
8357static int handle_vmon(struct kvm_vcpu *vcpu)
8358{
8359 int ret;
8360 gpa_t vmptr;
8361 struct page *page;
8362 struct vcpu_vmx *vmx = to_vmx(vcpu);
8363 const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
8364 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
8365
8366 /*
8367 * The Intel VMX Instruction Reference lists a bunch of bits that are
8368 * prerequisite to running VMXON, most notably cr4.VMXE must be set to
8369 * 1 (see vmx_set_cr4() for when we allow the guest to set this).
8370 * Otherwise, we should fail with #UD. But most faulting conditions
8371 * have already been checked by hardware, prior to the VM-exit for
8372 * VMXON. We do test guest cr4.VMXE because processor CR4 always has
8373 * that bit set to 1 in non-root mode.
8374 */
8375 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
8376 kvm_queue_exception(vcpu, UD_VECTOR);
8377 return 1;
8378 }
8379
8380 /* CPL=0 must be checked manually. */
8381 if (vmx_get_cpl(vcpu)) {
8382 kvm_inject_gp(vcpu, 0);
8383 return 1;
8384 }
8385
8386 if (vmx->nested.vmxon)
8387 return nested_vmx_failValid(vcpu,
8388 VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
8389
8390 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
8391 != VMXON_NEEDED_FEATURES) {
8392 kvm_inject_gp(vcpu, 0);
8393 return 1;
8394 }
8395
8396 if (nested_vmx_get_vmptr(vcpu, &vmptr))
8397 return 1;
8398
8399 /*
8400 * SDM 3: 24.11.5
8401	 * The first 4 bytes of the VMXON region contain the supported
8402	 * VMCS revision identifier.
8403	 *
8404	 * Note: IA32_VMX_BASIC[48] will never be 1 for the nested case;
8405	 * when set, it limits VMX structure addresses to 32 bits.
8406 */
8407 if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
8408 return nested_vmx_failInvalid(vcpu);
8409
8410 page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
8411 if (is_error_page(page))
8412 return nested_vmx_failInvalid(vcpu);
8413
8414 if (*(u32 *)kmap(page) != VMCS12_REVISION) {
8415 kunmap(page);
8416 kvm_release_page_clean(page);
8417 return nested_vmx_failInvalid(vcpu);
8418 }
8419 kunmap(page);
8420 kvm_release_page_clean(page);
8421
8422 vmx->nested.vmxon_ptr = vmptr;
8423 ret = enter_vmx_operation(vcpu);
8424 if (ret)
8425 return ret;
8426
8427 return nested_vmx_succeed(vcpu);
8428}
8429
8430/*
8431 * Intel's VMX Instruction Reference specifies a common set of prerequisites
8432 * for running VMX instructions (except VMXON, whose prerequisites are
8433 * slightly different). It also specifies what exception to inject otherwise.
8434 * Note that many of these exceptions have priority over VM exits, so they
8435 * don't have to be checked again here.
8436 */
8437static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
8438{
8439 if (!to_vmx(vcpu)->nested.vmxon) {
8440 kvm_queue_exception(vcpu, UD_VECTOR);
8441 return 0;
8442 }
8443
8444 if (vmx_get_cpl(vcpu)) {
8445 kvm_inject_gp(vcpu, 0);
8446 return 0;
8447 }
8448
8449 return 1;
8450}
8451
8452static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
8453{
8454 vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
8455 vmcs_write64(VMCS_LINK_POINTER, -1ull);
8456}
8457
8458static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
8459{
8460 struct vcpu_vmx *vmx = to_vmx(vcpu);
8461
8462 if (!vmx->nested.hv_evmcs)
8463 return;
8464
8465 kunmap(vmx->nested.hv_evmcs_page);
8466 kvm_release_page_dirty(vmx->nested.hv_evmcs_page);
8467 vmx->nested.hv_evmcs_vmptr = -1ull;
8468 vmx->nested.hv_evmcs_page = NULL;
8469 vmx->nested.hv_evmcs = NULL;
8470}
8471
8472static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
8473{
8474 struct vcpu_vmx *vmx = to_vmx(vcpu);
8475
8476 if (vmx->nested.current_vmptr == -1ull)
8477 return;
8478
8479 if (enable_shadow_vmcs) {
8480		/* copy to memory all shadowed fields in case
8481		 * they were modified */
8482 copy_shadow_to_vmcs12(vmx);
8483 vmx->nested.need_vmcs12_sync = false;
8484 vmx_disable_shadow_vmcs(vmx);
8485 }
8486 vmx->nested.posted_intr_nv = -1;
8487
8488 /* Flush VMCS12 to guest memory */
8489 kvm_vcpu_write_guest_page(vcpu,
8490 vmx->nested.current_vmptr >> PAGE_SHIFT,
8491 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
8492
8493 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
8494
8495 vmx->nested.current_vmptr = -1ull;
8496}
8497
8498/*
8499 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
8500 * just stops using VMX.
8501 */
8502static void free_nested(struct kvm_vcpu *vcpu)
8503{
8504 struct vcpu_vmx *vmx = to_vmx(vcpu);
8505
8506 if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
8507 return;
8508
8509 vmx->nested.vmxon = false;
8510 vmx->nested.smm.vmxon = false;
8511 free_vpid(vmx->nested.vpid02);
8512 vmx->nested.posted_intr_nv = -1;
8513 vmx->nested.current_vmptr = -1ull;
8514 if (enable_shadow_vmcs) {
8515 vmx_disable_shadow_vmcs(vmx);
8516 vmcs_clear(vmx->vmcs01.shadow_vmcs);
8517 free_vmcs(vmx->vmcs01.shadow_vmcs);
8518 vmx->vmcs01.shadow_vmcs = NULL;
8519 }
8520 kfree(vmx->nested.cached_vmcs12);
8521 kfree(vmx->nested.cached_shadow_vmcs12);
8522 /* Unpin physical memory we referred to in the vmcs02 */
8523 if (vmx->nested.apic_access_page) {
8524 kvm_release_page_dirty(vmx->nested.apic_access_page);
8525 vmx->nested.apic_access_page = NULL;
8526 }
8527 if (vmx->nested.virtual_apic_page) {
8528 kvm_release_page_dirty(vmx->nested.virtual_apic_page);
8529 vmx->nested.virtual_apic_page = NULL;
8530 }
8531 if (vmx->nested.pi_desc_page) {
8532 kunmap(vmx->nested.pi_desc_page);
8533 kvm_release_page_dirty(vmx->nested.pi_desc_page);
8534 vmx->nested.pi_desc_page = NULL;
8535 vmx->nested.pi_desc = NULL;
8536 }
8537
8538 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
8539
8540 nested_release_evmcs(vcpu);
8541
8542 free_loaded_vmcs(&vmx->nested.vmcs02);
8543}
8544
8545/* Emulate the VMXOFF instruction */
8546static int handle_vmoff(struct kvm_vcpu *vcpu)
8547{
8548 if (!nested_vmx_check_permission(vcpu))
8549 return 1;
8550 free_nested(vcpu);
8551 return nested_vmx_succeed(vcpu);
8552}
8553
8554/* Emulate the VMCLEAR instruction */
8555static int handle_vmclear(struct kvm_vcpu *vcpu)
8556{
8557 struct vcpu_vmx *vmx = to_vmx(vcpu);
8558 u32 zero = 0;
8559 gpa_t vmptr;
8560
8561 if (!nested_vmx_check_permission(vcpu))
8562 return 1;
8563
8564 if (nested_vmx_get_vmptr(vcpu, &vmptr))
8565 return 1;
8566
8567 if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
8568 return nested_vmx_failValid(vcpu,
8569 VMXERR_VMCLEAR_INVALID_ADDRESS);
8570
8571 if (vmptr == vmx->nested.vmxon_ptr)
8572 return nested_vmx_failValid(vcpu,
8573 VMXERR_VMCLEAR_VMXON_POINTER);
8574
8575 if (vmx->nested.hv_evmcs_page) {
8576 if (vmptr == vmx->nested.hv_evmcs_vmptr)
8577 nested_release_evmcs(vcpu);
8578 } else {
8579 if (vmptr == vmx->nested.current_vmptr)
8580 nested_release_vmcs12(vcpu);
8581
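		/*
		 * It appears sufficient for KVM's emulation of VMCLEAR to
		 * clear only the launch_state field in guest memory, since
		 * that is what nested VMLAUNCH/VMRESUME check.
		 */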
8582 kvm_vcpu_write_guest(vcpu,
8583 vmptr + offsetof(struct vmcs12,
8584 launch_state),
8585 &zero, sizeof(zero));
8586 }
8587
8588 return nested_vmx_succeed(vcpu);
8589}
8590
8591static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
8592
8593/* Emulate the VMLAUNCH instruction */
8594static int handle_vmlaunch(struct kvm_vcpu *vcpu)
8595{
8596 return nested_vmx_run(vcpu, true);
8597}
8598
8599/* Emulate the VMRESUME instruction */
8600static int handle_vmresume(struct kvm_vcpu *vcpu)
8601{
8602
8603 return nested_vmx_run(vcpu, false);
8604}
8605
8606/*
8607 * Read a vmcs12 field. Since these can have varying lengths and we return
8608 * one type, we chose the biggest type (u64) and zero-extend the return value
8609 * to that size. Note that the caller, handle_vmread, might need to use only
8610 * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
8611 * 64-bit fields are to be returned).
8612 */
8613static inline int vmcs12_read_any(struct vmcs12 *vmcs12,
8614 unsigned long field, u64 *ret)
8615{
8616 short offset = vmcs_field_to_offset(field);
8617 char *p;
8618
8619 if (offset < 0)
8620 return offset;
8621
8622 p = (char *)vmcs12 + offset;
8623
8624 switch (vmcs_field_width(field)) {
8625 case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
8626 *ret = *((natural_width *)p);
8627 return 0;
8628 case VMCS_FIELD_WIDTH_U16:
8629 *ret = *((u16 *)p);
8630 return 0;
8631 case VMCS_FIELD_WIDTH_U32:
8632 *ret = *((u32 *)p);
8633 return 0;
8634 case VMCS_FIELD_WIDTH_U64:
8635 *ret = *((u64 *)p);
8636 return 0;
8637 default:
8638 WARN_ON(1);
8639 return -ENOENT;
8640 }
8641}
8642
8643
8644static inline int vmcs12_write_any(struct vmcs12 *vmcs12,
8645				   unsigned long field, u64 field_value)
{
8646 short offset = vmcs_field_to_offset(field);
8647 char *p = (char *)vmcs12 + offset;
8648 if (offset < 0)
8649 return offset;
8650
8651 switch (vmcs_field_width(field)) {
8652 case VMCS_FIELD_WIDTH_U16:
8653 *(u16 *)p = field_value;
8654 return 0;
8655 case VMCS_FIELD_WIDTH_U32:
8656 *(u32 *)p = field_value;
8657 return 0;
8658 case VMCS_FIELD_WIDTH_U64:
8659 *(u64 *)p = field_value;
8660 return 0;
8661 case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
8662 *(natural_width *)p = field_value;
8663 return 0;
8664 default:
8665 WARN_ON(1);
8666 return -ENOENT;
8667 }
8668
8669}
8670
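/*
 * Pull fields from the enlightened VMCS into the cached vmcs12. A group
 * of fields is copied only when its bit in hv_clean_fields is clear,
 * i.e. when the guest has marked that group as modified since the last
 * VM entry.
 */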
8671static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
8672{
8673 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
8674 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
8675
8676 /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
8677 vmcs12->tpr_threshold = evmcs->tpr_threshold;
8678 vmcs12->guest_rip = evmcs->guest_rip;
8679
8680 if (unlikely(!(evmcs->hv_clean_fields &
8681 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
8682 vmcs12->guest_rsp = evmcs->guest_rsp;
8683 vmcs12->guest_rflags = evmcs->guest_rflags;
8684 vmcs12->guest_interruptibility_info =
8685 evmcs->guest_interruptibility_info;
8686 }
8687
8688 if (unlikely(!(evmcs->hv_clean_fields &
8689 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
8690 vmcs12->cpu_based_vm_exec_control =
8691 evmcs->cpu_based_vm_exec_control;
8692 }
8693
8694 if (unlikely(!(evmcs->hv_clean_fields &
8695 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
8696 vmcs12->exception_bitmap = evmcs->exception_bitmap;
8697 }
8698
8699 if (unlikely(!(evmcs->hv_clean_fields &
8700 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
8701 vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
8702 }
8703
8704 if (unlikely(!(evmcs->hv_clean_fields &
8705 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
8706 vmcs12->vm_entry_intr_info_field =
8707 evmcs->vm_entry_intr_info_field;
8708 vmcs12->vm_entry_exception_error_code =
8709 evmcs->vm_entry_exception_error_code;
8710 vmcs12->vm_entry_instruction_len =
8711 evmcs->vm_entry_instruction_len;
8712 }
8713
8714 if (unlikely(!(evmcs->hv_clean_fields &
8715 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
8716 vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
8717 vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
8718 vmcs12->host_cr0 = evmcs->host_cr0;
8719 vmcs12->host_cr3 = evmcs->host_cr3;
8720 vmcs12->host_cr4 = evmcs->host_cr4;
8721 vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
8722 vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
8723 vmcs12->host_rip = evmcs->host_rip;
8724 vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
8725 vmcs12->host_es_selector = evmcs->host_es_selector;
8726 vmcs12->host_cs_selector = evmcs->host_cs_selector;
8727 vmcs12->host_ss_selector = evmcs->host_ss_selector;
8728 vmcs12->host_ds_selector = evmcs->host_ds_selector;
8729 vmcs12->host_fs_selector = evmcs->host_fs_selector;
8730 vmcs12->host_gs_selector = evmcs->host_gs_selector;
8731 vmcs12->host_tr_selector = evmcs->host_tr_selector;
8732 }
8733
8734 if (unlikely(!(evmcs->hv_clean_fields &
8735 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
8736 vmcs12->pin_based_vm_exec_control =
8737 evmcs->pin_based_vm_exec_control;
8738 vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
8739 vmcs12->secondary_vm_exec_control =
8740 evmcs->secondary_vm_exec_control;
8741 }
8742
8743 if (unlikely(!(evmcs->hv_clean_fields &
8744 HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
8745 vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
8746 vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
8747 }
8748
8749 if (unlikely(!(evmcs->hv_clean_fields &
8750 HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
8751 vmcs12->msr_bitmap = evmcs->msr_bitmap;
8752 }
8753
8754 if (unlikely(!(evmcs->hv_clean_fields &
8755 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
8756 vmcs12->guest_es_base = evmcs->guest_es_base;
8757 vmcs12->guest_cs_base = evmcs->guest_cs_base;
8758 vmcs12->guest_ss_base = evmcs->guest_ss_base;
8759 vmcs12->guest_ds_base = evmcs->guest_ds_base;
8760 vmcs12->guest_fs_base = evmcs->guest_fs_base;
8761 vmcs12->guest_gs_base = evmcs->guest_gs_base;
8762 vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
8763 vmcs12->guest_tr_base = evmcs->guest_tr_base;
8764 vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
8765 vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
8766 vmcs12->guest_es_limit = evmcs->guest_es_limit;
8767 vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
8768 vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
8769 vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
8770 vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
8771 vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
8772 vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
8773 vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
8774 vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
8775 vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
8776 vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
8777 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
8778 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
8779 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
8780 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
8781 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
8782 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
8783 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
8784 vmcs12->guest_es_selector = evmcs->guest_es_selector;
8785 vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
8786 vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
8787 vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
8788 vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
8789 vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
8790 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
8791 vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
8792 }
8793
8794 if (unlikely(!(evmcs->hv_clean_fields &
8795 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
8796 vmcs12->tsc_offset = evmcs->tsc_offset;
8797 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
8798 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
8799 }
8800
8801 if (unlikely(!(evmcs->hv_clean_fields &
8802 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
8803 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
8804 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
8805 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
8806 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
8807 vmcs12->guest_cr0 = evmcs->guest_cr0;
8808 vmcs12->guest_cr3 = evmcs->guest_cr3;
8809 vmcs12->guest_cr4 = evmcs->guest_cr4;
8810 vmcs12->guest_dr7 = evmcs->guest_dr7;
8811 }
8812
8813 if (unlikely(!(evmcs->hv_clean_fields &
8814 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
8815 vmcs12->host_fs_base = evmcs->host_fs_base;
8816 vmcs12->host_gs_base = evmcs->host_gs_base;
8817 vmcs12->host_tr_base = evmcs->host_tr_base;
8818 vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
8819 vmcs12->host_idtr_base = evmcs->host_idtr_base;
8820 vmcs12->host_rsp = evmcs->host_rsp;
8821 }
8822
8823 if (unlikely(!(evmcs->hv_clean_fields &
8824 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
8825 vmcs12->ept_pointer = evmcs->ept_pointer;
8826 vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
8827 }
8828
8829 if (unlikely(!(evmcs->hv_clean_fields &
8830 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
8831 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
8832 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
8833 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
8834 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
8835 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
8836 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
8837 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
8838 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
8839 vmcs12->guest_pending_dbg_exceptions =
8840 evmcs->guest_pending_dbg_exceptions;
8841 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
8842 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
8843 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
8844 vmcs12->guest_activity_state = evmcs->guest_activity_state;
8845 vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
8846 }
8847
8848 /*
8849 * Not used?
8850 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
8851 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
8852 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
8853 * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0;
8854 * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1;
8855 * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2;
8856 * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3;
8857 * vmcs12->page_fault_error_code_mask =
8858 * evmcs->page_fault_error_code_mask;
8859 * vmcs12->page_fault_error_code_match =
8860 * evmcs->page_fault_error_code_match;
8861 * vmcs12->cr3_target_count = evmcs->cr3_target_count;
8862 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
8863 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
8864 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
8865 */
8866
8867 /*
8868 * Read only fields:
8869 * vmcs12->guest_physical_address = evmcs->guest_physical_address;
8870 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
8871 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
8872 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
8873 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
8874 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
8875 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
8876 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
8877 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
8878 * vmcs12->exit_qualification = evmcs->exit_qualification;
8879 * vmcs12->guest_linear_address = evmcs->guest_linear_address;
8880 *
8881 * Not present in struct vmcs12:
8882 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
8883 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
8884 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
8885 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
8886 */
8887
8888 return 0;
8889}
8890
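/*
 * Mirror the cached vmcs12 back into the enlightened VMCS for L1 to
 * read. Host-state and other L1-owned fields are deliberately left
 * untouched, as listed in the comment below.
 */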
8891static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
8892{
8893 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
8894 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
8895
8896 /*
8897 * Should not be changed by KVM:
8898 *
8899 * evmcs->host_es_selector = vmcs12->host_es_selector;
8900 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
8901 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
8902 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
8903 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
8904 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
8905 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
8906 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
8907 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
8908 * evmcs->host_cr0 = vmcs12->host_cr0;
8909 * evmcs->host_cr3 = vmcs12->host_cr3;
8910 * evmcs->host_cr4 = vmcs12->host_cr4;
8911 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
8912 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
8913 * evmcs->host_rip = vmcs12->host_rip;
8914 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
8915 * evmcs->host_fs_base = vmcs12->host_fs_base;
8916 * evmcs->host_gs_base = vmcs12->host_gs_base;
8917 * evmcs->host_tr_base = vmcs12->host_tr_base;
8918 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
8919 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
8920 * evmcs->host_rsp = vmcs12->host_rsp;
8921 * sync_vmcs12() doesn't read these:
8922 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
8923 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
8924 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
8925 * evmcs->ept_pointer = vmcs12->ept_pointer;
8926 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
8927 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
8928 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
8929 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
8930 * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0;
8931 * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1;
8932 * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2;
8933 * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3;
8934 * evmcs->tpr_threshold = vmcs12->tpr_threshold;
8935 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
8936 * evmcs->exception_bitmap = vmcs12->exception_bitmap;
8937 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
8938 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
8939 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
8940 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
8941 * evmcs->page_fault_error_code_mask =
8942 * vmcs12->page_fault_error_code_mask;
8943 * evmcs->page_fault_error_code_match =
8944 * vmcs12->page_fault_error_code_match;
8945 * evmcs->cr3_target_count = vmcs12->cr3_target_count;
8946 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
8947 * evmcs->tsc_offset = vmcs12->tsc_offset;
8948 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
8949 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
8950 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
8951 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
8952 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
8953 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
8954 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
8955 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
8956 *
8957 * Not present in struct vmcs12:
8958 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
8959 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
8960 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
8961 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
8962 */
8963
8964 evmcs->guest_es_selector = vmcs12->guest_es_selector;
8965 evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
8966 evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
8967 evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
8968 evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
8969 evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
8970 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
8971 evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
8972
8973 evmcs->guest_es_limit = vmcs12->guest_es_limit;
8974 evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
8975 evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
8976 evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
8977 evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
8978 evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
8979 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
8980 evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
8981 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
8982 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
8983
8984 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
8985 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
8986 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
8987 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
8988 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
8989 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
8990 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
8991 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
8992
8993 evmcs->guest_es_base = vmcs12->guest_es_base;
8994 evmcs->guest_cs_base = vmcs12->guest_cs_base;
8995 evmcs->guest_ss_base = vmcs12->guest_ss_base;
8996 evmcs->guest_ds_base = vmcs12->guest_ds_base;
8997 evmcs->guest_fs_base = vmcs12->guest_fs_base;
8998 evmcs->guest_gs_base = vmcs12->guest_gs_base;
8999 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
9000 evmcs->guest_tr_base = vmcs12->guest_tr_base;
9001 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
9002 evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
9003
9004 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
9005 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
9006
9007 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
9008 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
9009 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
9010 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
9011
9012 evmcs->guest_pending_dbg_exceptions =
9013 vmcs12->guest_pending_dbg_exceptions;
9014 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
9015 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
9016
9017 evmcs->guest_activity_state = vmcs12->guest_activity_state;
9018 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
9019
9020 evmcs->guest_cr0 = vmcs12->guest_cr0;
9021 evmcs->guest_cr3 = vmcs12->guest_cr3;
9022 evmcs->guest_cr4 = vmcs12->guest_cr4;
9023 evmcs->guest_dr7 = vmcs12->guest_dr7;
9024
9025 evmcs->guest_physical_address = vmcs12->guest_physical_address;
9026
9027 evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
9028 evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
9029 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
9030 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
9031 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
9032 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
9033 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
9034 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
9035
9036 evmcs->exit_qualification = vmcs12->exit_qualification;
9037
9038 evmcs->guest_linear_address = vmcs12->guest_linear_address;
9039 evmcs->guest_rsp = vmcs12->guest_rsp;
9040 evmcs->guest_rflags = vmcs12->guest_rflags;
9041
9042 evmcs->guest_interruptibility_info =
9043 vmcs12->guest_interruptibility_info;
9044 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
9045 evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
9046 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
9047 evmcs->vm_entry_exception_error_code =
9048 vmcs12->vm_entry_exception_error_code;
9049 evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
9050
9051 evmcs->guest_rip = vmcs12->guest_rip;
9052
9053 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
9054
9055 return 0;
9056}
9057
9058/*
9059 * Copy the writable VMCS shadow fields back to the VMCS12, in case
9060 * they have been modified by the L1 guest. Note that the "read-only"
9061 * VM-exit information fields are actually writable if the vCPU is
9062 * configured to support "VMWRITE to any supported field in the VMCS."
9063 */
9064static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
9065{
9066 const u16 *fields[] = {
9067 shadow_read_write_fields,
9068 shadow_read_only_fields
9069 };
9070 const int max_fields[] = {
9071 max_shadow_read_write_fields,
9072 max_shadow_read_only_fields
9073 };
9074 int i, q;
9075 unsigned long field;
9076 u64 field_value;
9077 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
9078
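	/*
	 * vmcs_load() switches this CPU's current VMCS, so preemption
	 * must stay disabled until the original loaded_vmcs->vmcs is
	 * reloaded below.
	 */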
9079 preempt_disable();
9080
9081 vmcs_load(shadow_vmcs);
9082
9083 for (q = 0; q < ARRAY_SIZE(fields); q++) {
9084 for (i = 0; i < max_fields[q]; i++) {
9085 field = fields[q][i];
9086 field_value = __vmcs_readl(field);
9087 vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value);
9088 }
9089 /*
9090 * Skip the VM-exit information fields if they are read-only.
9091 */
9092 if (!nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
9093 break;
9094 }
9095
9096 vmcs_clear(shadow_vmcs);
9097 vmcs_load(vmx->loaded_vmcs->vmcs);
9098
9099 preempt_enable();
9100}
9101
9102static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
9103{
9104 const u16 *fields[] = {
9105 shadow_read_write_fields,
9106 shadow_read_only_fields
9107 };
9108 const int max_fields[] = {
9109 max_shadow_read_write_fields,
9110 max_shadow_read_only_fields
9111 };
9112 int i, q;
9113 unsigned long field;
9114 u64 field_value = 0;
9115 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
9116
9117 vmcs_load(shadow_vmcs);
9118
9119 for (q = 0; q < ARRAY_SIZE(fields); q++) {
9120 for (i = 0; i < max_fields[q]; i++) {
9121 field = fields[q][i];
9122 vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value);
9123 __vmcs_writel(field, field_value);
9124 }
9125 }
9126
9127 vmcs_clear(shadow_vmcs);
9128 vmcs_load(vmx->loaded_vmcs->vmcs);
9129}
9130
9131static int handle_vmread(struct kvm_vcpu *vcpu)
9132{
9133 unsigned long field;
9134 u64 field_value;
9135 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
9136 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
9137 gva_t gva = 0;
9138 struct vmcs12 *vmcs12;
9139
9140 if (!nested_vmx_check_permission(vcpu))
9141 return 1;
9142
9143 if (to_vmx(vcpu)->nested.current_vmptr == -1ull)
9144 return nested_vmx_failInvalid(vcpu);
9145
9146 if (!is_guest_mode(vcpu))
9147 vmcs12 = get_vmcs12(vcpu);
9148 else {
9149 /*
9150		 * When vmcs12->vmcs_link_pointer is -1ull, any VMREAD from a
9151		 * shadowed field sets the ALU flags for VMfailInvalid.
9152 */
9153 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
9154 return nested_vmx_failInvalid(vcpu);
9155 vmcs12 = get_shadow_vmcs12(vcpu);
9156 }
9157
9158 /* Decode instruction info and find the field to read */
9159 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
9160 /* Read the field, zero-extended to a u64 field_value */
9161 if (vmcs12_read_any(vmcs12, field, &field_value) < 0)
9162 return nested_vmx_failValid(vcpu,
9163 VMXERR_UNSUPPORTED_VMCS_COMPONENT);
9164
9165 /*
9166 * Now copy part of this value to register or memory, as requested.
9167 * Note that the number of bits actually copied is 32 or 64 depending
9168 * on the guest's mode (32 or 64 bit), not on the given field's length.
9169 */
9170 if (vmx_instruction_info & (1u << 10)) {
9171 kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
9172 field_value);
9173 } else {
9174 if (get_vmx_mem_address(vcpu, exit_qualification,
9175 vmx_instruction_info, true, &gva))
9176 return 1;
9177 /* _system ok, nested_vmx_check_permission has verified cpl=0 */
9178 kvm_write_guest_virt_system(vcpu, gva, &field_value,
9179 (is_long_mode(vcpu) ? 8 : 4), NULL);
9180 }
9181
9182 return nested_vmx_succeed(vcpu);
9183}
9184
9185
9186static int handle_vmwrite(struct kvm_vcpu *vcpu)
9187{
9188 unsigned long field;
9189 gva_t gva;
9190 struct vcpu_vmx *vmx = to_vmx(vcpu);
9191 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
9192 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
9193
9194 /* The value to write might be 32 or 64 bits, depending on L1's long
9195 * mode, and eventually we need to write that into a field of several
9196 * possible lengths. The code below first zero-extends the value to 64
9197 * bit (field_value), and then copies only the appropriate number of
9198 * bits into the vmcs12 field.
9199 */
9200 u64 field_value = 0;
9201 struct x86_exception e;
9202 struct vmcs12 *vmcs12;
9203
9204 if (!nested_vmx_check_permission(vcpu))
9205 return 1;
9206
9207 if (vmx->nested.current_vmptr == -1ull)
9208 return nested_vmx_failInvalid(vcpu);
9209
9210 if (vmx_instruction_info & (1u << 10))
9211 field_value = kvm_register_readl(vcpu,
9212 (((vmx_instruction_info) >> 3) & 0xf));
9213 else {
9214 if (get_vmx_mem_address(vcpu, exit_qualification,
9215 vmx_instruction_info, false, &gva))
9216 return 1;
9217 if (kvm_read_guest_virt(vcpu, gva, &field_value,
9218 (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
9219 kvm_inject_page_fault(vcpu, &e);
9220 return 1;
9221 }
9222 }
9223
9224
9225 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
9226 /*
9227 * If the vCPU supports "VMWRITE to any supported field in the
9228 * VMCS," then the "read-only" fields are actually read/write.
9229 */
9230 if (vmcs_field_readonly(field) &&
9231 !nested_cpu_has_vmwrite_any_field(vcpu))
9232 return nested_vmx_failValid(vcpu,
9233 VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
9234
9235 if (!is_guest_mode(vcpu))
9236 vmcs12 = get_vmcs12(vcpu);
9237 else {
9238 /*
9239		 * When vmcs12->vmcs_link_pointer is -1ull, any VMWRITE to a
9240		 * shadowed field sets the ALU flags for VMfailInvalid.
9241 */
9242 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
9243 return nested_vmx_failInvalid(vcpu);
9244 vmcs12 = get_shadow_vmcs12(vcpu);
9245 }
9246
9247 if (vmcs12_write_any(vmcs12, field, field_value) < 0)
9248 return nested_vmx_failValid(vcpu,
9249 VMXERR_UNSUPPORTED_VMCS_COMPONENT);
9250
9251 /*
9252	 * Do not track vmcs12 dirty state while in guest mode, as we
9253	 * actually dirty the shadow vmcs12 instead of vmcs12 itself.
9254 */
9255 if (!is_guest_mode(vcpu)) {
9256 switch (field) {
9257#define SHADOW_FIELD_RW(x) case x:
9258#include "vmx_shadow_fields.h"
9259 /*
9260 * The fields that can be updated by L1 without a vmexit are
9261	 * always updated in the vmcs02; the others go down the slow
9262 * path of prepare_vmcs02.
9263 */
9264 break;
9265 default:
9266 vmx->nested.dirty_vmcs12 = true;
9267 break;
9268 }
9269 }
9270
9271 return nested_vmx_succeed(vcpu);
9272}
9273
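/*
 * Make vmptr the current vmcs12. With shadow VMCS enabled, also point
 * the hardware VMCS link pointer at vmcs01's shadow VMCS so that L1's
 * VMREAD/VMWRITE to shadowed fields need not exit to L0.
 */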
9274static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
9275{
9276 vmx->nested.current_vmptr = vmptr;
9277 if (enable_shadow_vmcs) {
9278 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
9279 SECONDARY_EXEC_SHADOW_VMCS);
9280 vmcs_write64(VMCS_LINK_POINTER,
9281 __pa(vmx->vmcs01.shadow_vmcs));
9282 vmx->nested.need_vmcs12_sync = true;
9283 }
9284 vmx->nested.dirty_vmcs12 = true;
9285}
9286
9287/* Emulate the VMPTRLD instruction */
9288static int handle_vmptrld(struct kvm_vcpu *vcpu)
9289{
9290 struct vcpu_vmx *vmx = to_vmx(vcpu);
9291 gpa_t vmptr;
9292
9293 if (!nested_vmx_check_permission(vcpu))
9294 return 1;
9295
9296 if (nested_vmx_get_vmptr(vcpu, &vmptr))
9297 return 1;
9298
9299 if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
9300 return nested_vmx_failValid(vcpu,
9301 VMXERR_VMPTRLD_INVALID_ADDRESS);
9302
9303 if (vmptr == vmx->nested.vmxon_ptr)
9304 return nested_vmx_failValid(vcpu,
9305 VMXERR_VMPTRLD_VMXON_POINTER);
9306
9307 /* Forbid normal VMPTRLD if Enlightened version was used */
9308 if (vmx->nested.hv_evmcs)
9309 return 1;
9310
9311 if (vmx->nested.current_vmptr != vmptr) {
9312 struct vmcs12 *new_vmcs12;
9313 struct page *page;
9314 page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
9315 if (is_error_page(page))
9316 return nested_vmx_failInvalid(vcpu);
9317
9318 new_vmcs12 = kmap(page);
9319 if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
9320 (new_vmcs12->hdr.shadow_vmcs &&
9321 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
9322 kunmap(page);
9323 kvm_release_page_clean(page);
9324 return nested_vmx_failValid(vcpu,
9325 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
9326 }
9327
9328 nested_release_vmcs12(vcpu);
9329
9330 /*
9331 * Load VMCS12 from guest memory since it is not already
9332 * cached.
9333 */
9334 memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
9335 kunmap(page);
9336 kvm_release_page_clean(page);
9337
9338 set_current_vmptr(vmx, vmptr);
9339 }
9340
9341 return nested_vmx_succeed(vcpu);
9342}
9343
9344/*
9345 * This is the equivalent of the nested hypervisor executing the VMPTRLD
9346 * instruction.
9347 */
9348static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
9349 bool from_launch)
9350{
9351 struct vcpu_vmx *vmx = to_vmx(vcpu);
9352 struct hv_vp_assist_page assist_page;
9353
9354 if (likely(!vmx->nested.enlightened_vmcs_enabled))
9355 return 1;
9356
9357 if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page)))
9358 return 1;
9359
9360 if (unlikely(!assist_page.enlighten_vmentry))
9361 return 1;
9362
9363 if (unlikely(assist_page.current_nested_vmcs !=
9364 vmx->nested.hv_evmcs_vmptr)) {
9365
9366 if (!vmx->nested.hv_evmcs)
9367 vmx->nested.current_vmptr = -1ull;
9368
9369 nested_release_evmcs(vcpu);
9370
9371 vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page(
9372 vcpu, assist_page.current_nested_vmcs);
9373
9374 if (unlikely(is_error_page(vmx->nested.hv_evmcs_page)))
9375 return 0;
9376
9377 vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page);
9378
9379 /*
 9380			 * Currently, KVM only supports eVMCS version 1
 9381			 * (== KVM_EVMCS_VERSION) and thus we expect the guest to set
 9382			 * this value in the first u32 field of the eVMCS, which should
 9383			 * specify the eVMCS VersionNumber.
 9384			 *
 9385			 * The guest should learn the eVMCS versions supported by the
 9386			 * host by examining CPUID.0x4000000A.EAX[0:15]. The host
 9387			 * userspace VMM is expected to set this CPUID leaf according
 9388			 * to the value returned in vmcs_version from nested_enable_evmcs().
 9389			 *
 9390			 * However, it turns out that Microsoft Hyper-V fails to comply
 9391			 * with its own invented interface: when Hyper-V uses eVMCS, it
 9392			 * just sets the first u32 field of the eVMCS to the revision_id
 9393			 * specified in MSR_IA32_VMX_BASIC, instead of the eVMCS version
 9394			 * number, which should be one of the supported versions
 9395			 * specified in CPUID.0x4000000A.EAX[0:15].
 9396			 *
 9397			 * To work around this Hyper-V bug, we accept here either a
 9398			 * supported eVMCS version or the VMCS12 revision_id as valid
 9399			 * values for the first u32 field of the eVMCS.
9400 */
9401 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
9402 (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
9403 nested_release_evmcs(vcpu);
9404 return 0;
9405 }
9406
9407 vmx->nested.dirty_vmcs12 = true;
9408 /*
 9409			 * As we keep L2 state for one guest only, the 'hv_clean_fields'
 9410			 * mask can't be used when we switch between them. Reset it here
 9411			 * for simplicity.
9412 */
9413 vmx->nested.hv_evmcs->hv_clean_fields &=
9414 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
9415 vmx->nested.hv_evmcs_vmptr = assist_page.current_nested_vmcs;
9416
9417 /*
9418 * Unlike normal vmcs12, enlightened vmcs12 is not fully
 9419			 * reloaded from the guest's memory (read-only fields, fields not
9420 * present in struct hv_enlightened_vmcs, ...). Make sure there
9421 * are no leftovers.
9422 */
9423 if (from_launch) {
9424 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
9425 memset(vmcs12, 0, sizeof(*vmcs12));
9426 vmcs12->hdr.revision_id = VMCS12_REVISION;
9427 }
9428
9429 }
9430 return 1;
9431}
9432
9433/* Emulate the VMPTRST instruction */
9434static int handle_vmptrst(struct kvm_vcpu *vcpu)
9435{
9436 unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION);
9437 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
9438 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
9439 struct x86_exception e;
9440 gva_t gva;
9441
9442 if (!nested_vmx_check_permission(vcpu))
9443 return 1;
9444
9445 if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
9446 return 1;
9447
9448 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva))
9449 return 1;
9450 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
9451 if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
9452 sizeof(gpa_t), &e)) {
9453 kvm_inject_page_fault(vcpu, &e);
9454 return 1;
9455 }
9456 return nested_vmx_succeed(vcpu);
9457}
9458
9459/* Emulate the INVEPT instruction */
9460static int handle_invept(struct kvm_vcpu *vcpu)
9461{
9462 struct vcpu_vmx *vmx = to_vmx(vcpu);
9463 u32 vmx_instruction_info, types;
9464 unsigned long type;
9465 gva_t gva;
9466 struct x86_exception e;
9467 struct {
9468 u64 eptp, gpa;
9469 } operand;
9470
9471 if (!(vmx->nested.msrs.secondary_ctls_high &
9472 SECONDARY_EXEC_ENABLE_EPT) ||
9473 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
9474 kvm_queue_exception(vcpu, UD_VECTOR);
9475 return 1;
9476 }
9477
9478 if (!nested_vmx_check_permission(vcpu))
9479 return 1;
9480
9481 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
9482 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
9483
9484 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
9485
9486 if (type >= 32 || !(types & (1 << type)))
9487 return nested_vmx_failValid(vcpu,
9488 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
9489
9490 /* According to the Intel VMX instruction reference, the memory
9491 * operand is read even if it isn't needed (e.g., for type==global)
9492 */
9493 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
9494 vmx_instruction_info, false, &gva))
9495 return 1;
9496 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
9497 kvm_inject_page_fault(vcpu, &e);
9498 return 1;
9499 }
9500
9501 switch (type) {
9502 case VMX_EPT_EXTENT_GLOBAL:
9503 /*
9504 * TODO: track mappings and invalidate
9505 * single context requests appropriately
9506 */
9507 case VMX_EPT_EXTENT_CONTEXT:
9508 kvm_mmu_sync_roots(vcpu);
9509 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
9510 break;
9511 default:
9512 BUG_ON(1);
9513 break;
9514 }
9515
9516 return nested_vmx_succeed(vcpu);
9517}
9518
9519static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
9520{
9521 struct vcpu_vmx *vmx = to_vmx(vcpu);
9522
9523 return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
9524}
9525
9526static int handle_invvpid(struct kvm_vcpu *vcpu)
9527{
9528 struct vcpu_vmx *vmx = to_vmx(vcpu);
9529 u32 vmx_instruction_info;
9530 unsigned long type, types;
9531 gva_t gva;
9532 struct x86_exception e;
9533 struct {
9534 u64 vpid;
9535 u64 gla;
9536 } operand;
9537 u16 vpid02;
9538
9539 if (!(vmx->nested.msrs.secondary_ctls_high &
9540 SECONDARY_EXEC_ENABLE_VPID) ||
9541 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
9542 kvm_queue_exception(vcpu, UD_VECTOR);
9543 return 1;
9544 }
9545
9546 if (!nested_vmx_check_permission(vcpu))
9547 return 1;
9548
9549 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
9550 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
9551
9552 types = (vmx->nested.msrs.vpid_caps &
9553 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
9554
9555 if (type >= 32 || !(types & (1 << type)))
9556 return nested_vmx_failValid(vcpu,
9557 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
9558
 9559	/* According to the Intel VMX instruction reference, the memory
9560 * operand is read even if it isn't needed (e.g., for type==global)
9561 */
9562 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
9563 vmx_instruction_info, false, &gva))
9564 return 1;
9565 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
9566 kvm_inject_page_fault(vcpu, &e);
9567 return 1;
9568 }
9569 if (operand.vpid >> 16)
9570 return nested_vmx_failValid(vcpu,
9571 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
9572
9573 vpid02 = nested_get_vpid02(vcpu);
9574 switch (type) {
9575 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
9576 if (!operand.vpid ||
9577 is_noncanonical_address(operand.gla, vcpu))
9578 return nested_vmx_failValid(vcpu,
9579 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
9580 if (cpu_has_vmx_invvpid_individual_addr()) {
9581 __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
9582 vpid02, operand.gla);
9583 } else
9584 __vmx_flush_tlb(vcpu, vpid02, false);
9585 break;
9586 case VMX_VPID_EXTENT_SINGLE_CONTEXT:
9587 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
9588 if (!operand.vpid)
9589 return nested_vmx_failValid(vcpu,
9590 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
9591 __vmx_flush_tlb(vcpu, vpid02, false);
9592 break;
9593 case VMX_VPID_EXTENT_ALL_CONTEXT:
9594 __vmx_flush_tlb(vcpu, vpid02, false);
9595 break;
9596 default:
9597 WARN_ON_ONCE(1);
9598 return kvm_skip_emulated_instruction(vcpu);
9599 }
9600
9601 return nested_vmx_succeed(vcpu);
9602}
9603
9604static int handle_invpcid(struct kvm_vcpu *vcpu)
9605{
9606 u32 vmx_instruction_info;
9607 unsigned long type;
9608 bool pcid_enabled;
9609 gva_t gva;
9610 struct x86_exception e;
9611 unsigned i;
9612 unsigned long roots_to_free = 0;
9613 struct {
9614 u64 pcid;
9615 u64 gla;
9616 } operand;
9617
9618 if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
9619 kvm_queue_exception(vcpu, UD_VECTOR);
9620 return 1;
9621 }
9622
9623 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
9624 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
9625
9626 if (type > 3) {
9627 kvm_inject_gp(vcpu, 0);
9628 return 1;
9629 }
9630
9631 /* According to the Intel instruction reference, the memory operand
9632 * is read even if it isn't needed (e.g., for type==all)
9633 */
9634 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
9635 vmx_instruction_info, false, &gva))
9636 return 1;
9637
9638 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
9639 kvm_inject_page_fault(vcpu, &e);
9640 return 1;
9641 }
9642
9643 if (operand.pcid >> 12 != 0) {
9644 kvm_inject_gp(vcpu, 0);
9645 return 1;
9646 }
9647
9648 pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
9649
9650 switch (type) {
9651 case INVPCID_TYPE_INDIV_ADDR:
9652 if ((!pcid_enabled && (operand.pcid != 0)) ||
9653 is_noncanonical_address(operand.gla, vcpu)) {
9654 kvm_inject_gp(vcpu, 0);
9655 return 1;
9656 }
9657 kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
9658 return kvm_skip_emulated_instruction(vcpu);
9659
9660 case INVPCID_TYPE_SINGLE_CTXT:
9661 if (!pcid_enabled && (operand.pcid != 0)) {
9662 kvm_inject_gp(vcpu, 0);
9663 return 1;
9664 }
9665
9666 if (kvm_get_active_pcid(vcpu) == operand.pcid) {
9667 kvm_mmu_sync_roots(vcpu);
9668 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
9669 }
9670
9671 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
9672 if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3)
9673 == operand.pcid)
9674 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
9675
9676 kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
9677 /*
9678 * If neither the current cr3 nor any of the prev_roots use the
9679 * given PCID, then nothing needs to be done here because a
9680 * resync will happen anyway before switching to any other CR3.
9681 */
9682
9683 return kvm_skip_emulated_instruction(vcpu);
9684
9685 case INVPCID_TYPE_ALL_NON_GLOBAL:
9686 /*
9687 * Currently, KVM doesn't mark global entries in the shadow
9688 * page tables, so a non-global flush just degenerates to a
9689 * global flush. If needed, we could optimize this later by
9690 * keeping track of global entries in shadow page tables.
9691 */
9692
9693 /* fall-through */
9694 case INVPCID_TYPE_ALL_INCL_GLOBAL:
9695 kvm_mmu_unload(vcpu);
9696 return kvm_skip_emulated_instruction(vcpu);
9697
9698 default:
9699 BUG(); /* We have already checked above that type <= 3 */
9700 }
9701}
9702
9703static int handle_pml_full(struct kvm_vcpu *vcpu)
9704{
9705 unsigned long exit_qualification;
9706
9707 trace_kvm_pml_full(vcpu->vcpu_id);
9708
9709 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
9710
9711 /*
9712 * PML buffer FULL happened while executing iret from NMI,
9713 * "blocked by NMI" bit has to be set before next VM entry.
9714 */
9715 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
9716 enable_vnmi &&
9717 (exit_qualification & INTR_INFO_UNBLOCK_NMI))
9718 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
9719 GUEST_INTR_STATE_NMI);
9720
9721 /*
 9722	 * The PML buffer was already flushed at the beginning of the VMEXIT.
 9723	 * Nothing to do here, and there's no userspace involvement needed for PML.
9724 */
9725 return 1;
9726}
9727
9728static int handle_preemption_timer(struct kvm_vcpu *vcpu)
9729{
9730 if (!to_vmx(vcpu)->req_immediate_exit)
9731 kvm_lapic_expired_hv_timer(vcpu);
9732 return 1;
9733}
9734
9735static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
9736{
9737 struct vcpu_vmx *vmx = to_vmx(vcpu);
9738 int maxphyaddr = cpuid_maxphyaddr(vcpu);
9739
9740 /* Check for memory type validity */
9741 switch (address & VMX_EPTP_MT_MASK) {
9742 case VMX_EPTP_MT_UC:
9743 if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))
9744 return false;
9745 break;
9746 case VMX_EPTP_MT_WB:
9747 if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))
9748 return false;
9749 break;
9750 default:
9751 return false;
9752 }
9753
 9754	/* only a 4-level page-walk length is valid */
9755 if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4)
9756 return false;
9757
9758 /* Reserved bits should not be set */
9759 if (address >> maxphyaddr || ((address >> 7) & 0x1f))
9760 return false;
9761
9762 /* AD, if set, should be supported */
9763 if (address & VMX_EPTP_AD_ENABLE_BIT) {
9764 if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))
9765 return false;
9766 }
9767
9768 return true;
9769}
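
/*
 * Illustrative sketch, not part of vmx.c: the EPTP field layout that
 * valid_ept_address() checks above -- memory type in bits 2:0,
 * (page-walk length - 1) in bits 5:3, the accessed/dirty enable in bit 6,
 * and bits 11:7 reserved. The struct and helper names are assumptions made
 * for this example only.
 */
struct eptp_fields_sketch {
	unsigned int memtype;		/* bits 2:0; 0 = UC, 6 = WB */
	unsigned int page_walk_levels;	/* bits 5:3 encode (levels - 1) */
	int ad_enabled;			/* bit 6 */
	unsigned int reserved_11_7;	/* must be zero */
};

static struct eptp_fields_sketch decode_eptp_sketch(unsigned long long eptp)
{
	struct eptp_fields_sketch f;

	f.memtype = eptp & 0x7;
	f.page_walk_levels = ((eptp >> 3) & 0x7) + 1;
	f.ad_enabled = (eptp >> 6) & 1;
	f.reserved_11_7 = (eptp >> 7) & 0x1f;
	return f;
}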
9770
9771static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
9772 struct vmcs12 *vmcs12)
9773{
9774 u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
9775 u64 address;
9776 bool accessed_dirty;
9777 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
9778
9779 if (!nested_cpu_has_eptp_switching(vmcs12) ||
9780 !nested_cpu_has_ept(vmcs12))
9781 return 1;
9782
9783 if (index >= VMFUNC_EPTP_ENTRIES)
9784 return 1;
9785
9786
9787 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
9788 &address, index * 8, 8))
9789 return 1;
9790
9791 accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT);
9792
9793 /*
9794 * If the (L2) guest does a vmfunc to the currently
9795 * active ept pointer, we don't have to do anything else
9796 */
9797 if (vmcs12->ept_pointer != address) {
9798 if (!valid_ept_address(vcpu, address))
9799 return 1;
9800
9801 kvm_mmu_unload(vcpu);
9802 mmu->ept_ad = accessed_dirty;
9803 mmu->mmu_role.base.ad_disabled = !accessed_dirty;
9804 vmcs12->ept_pointer = address;
9805 /*
 9806		 * TODO: Check what the correct approach is in case the
 9807		 * mmu reload fails. Currently, we just let the next
 9808		 * reload potentially fail.
9809 */
9810 kvm_mmu_reload(vcpu);
9811 }
9812
9813 return 0;
9814}
9815
9816static int handle_vmfunc(struct kvm_vcpu *vcpu)
9817{
9818 struct vcpu_vmx *vmx = to_vmx(vcpu);
9819 struct vmcs12 *vmcs12;
9820 u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
9821
9822 /*
9823 * VMFUNC is only supported for nested guests, but we always enable the
9824 * secondary control for simplicity; for non-nested mode, fake that we
9825 * didn't by injecting #UD.
9826 */
9827 if (!is_guest_mode(vcpu)) {
9828 kvm_queue_exception(vcpu, UD_VECTOR);
9829 return 1;
9830 }
9831
9832 vmcs12 = get_vmcs12(vcpu);
9833 if ((vmcs12->vm_function_control & (1 << function)) == 0)
9834 goto fail;
9835
9836 switch (function) {
9837 case 0:
9838 if (nested_vmx_eptp_switching(vcpu, vmcs12))
9839 goto fail;
9840 break;
9841 default:
9842 goto fail;
9843 }
9844 return kvm_skip_emulated_instruction(vcpu);
9845
9846fail:
9847 nested_vmx_vmexit(vcpu, vmx->exit_reason,
9848 vmcs_read32(VM_EXIT_INTR_INFO),
9849 vmcs_readl(EXIT_QUALIFICATION));
9850 return 1;
9851}
9852
9853static int handle_encls(struct kvm_vcpu *vcpu)
9854{
9855 /*
9856 * SGX virtualization is not yet supported. There is no software
9857 * enable bit for SGX, so we have to trap ENCLS and inject a #UD
9858 * to prevent the guest from executing ENCLS.
9859 */
9860 kvm_queue_exception(vcpu, UD_VECTOR);
9861 return 1;
9862}
9863
9864/*
9865 * The exit handlers return 1 if the exit was handled fully and guest execution
9866 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
9867 * to be done to userspace and return 0.
9868 */
9869static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
9870 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
9871 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
9872 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
9873 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
9874 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
9875 [EXIT_REASON_CR_ACCESS] = handle_cr,
9876 [EXIT_REASON_DR_ACCESS] = handle_dr,
9877 [EXIT_REASON_CPUID] = handle_cpuid,
9878 [EXIT_REASON_MSR_READ] = handle_rdmsr,
9879 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
9880 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
9881 [EXIT_REASON_HLT] = handle_halt,
9882 [EXIT_REASON_INVD] = handle_invd,
9883 [EXIT_REASON_INVLPG] = handle_invlpg,
9884 [EXIT_REASON_RDPMC] = handle_rdpmc,
9885 [EXIT_REASON_VMCALL] = handle_vmcall,
9886 [EXIT_REASON_VMCLEAR] = handle_vmclear,
9887 [EXIT_REASON_VMLAUNCH] = handle_vmlaunch,
9888 [EXIT_REASON_VMPTRLD] = handle_vmptrld,
9889 [EXIT_REASON_VMPTRST] = handle_vmptrst,
9890 [EXIT_REASON_VMREAD] = handle_vmread,
9891 [EXIT_REASON_VMRESUME] = handle_vmresume,
9892 [EXIT_REASON_VMWRITE] = handle_vmwrite,
9893 [EXIT_REASON_VMOFF] = handle_vmoff,
9894 [EXIT_REASON_VMON] = handle_vmon,
9895 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
9896 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
9897 [EXIT_REASON_APIC_WRITE] = handle_apic_write,
9898 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
9899 [EXIT_REASON_WBINVD] = handle_wbinvd,
9900 [EXIT_REASON_XSETBV] = handle_xsetbv,
9901 [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
9902 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
9903 [EXIT_REASON_GDTR_IDTR] = handle_desc,
9904 [EXIT_REASON_LDTR_TR] = handle_desc,
9905 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
9906 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
9907 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
9908 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait,
9909 [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap,
9910 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor,
9911 [EXIT_REASON_INVEPT] = handle_invept,
9912 [EXIT_REASON_INVVPID] = handle_invvpid,
9913 [EXIT_REASON_RDRAND] = handle_invalid_op,
9914 [EXIT_REASON_RDSEED] = handle_invalid_op,
9915 [EXIT_REASON_XSAVES] = handle_xsaves,
9916 [EXIT_REASON_XRSTORS] = handle_xrstors,
9917 [EXIT_REASON_PML_FULL] = handle_pml_full,
9918 [EXIT_REASON_INVPCID] = handle_invpcid,
9919 [EXIT_REASON_VMFUNC] = handle_vmfunc,
9920 [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
9921 [EXIT_REASON_ENCLS] = handle_encls,
9922};
9923
9924static const int kvm_vmx_max_exit_handlers =
9925 ARRAY_SIZE(kvm_vmx_exit_handlers);
9926
9927static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
9928 struct vmcs12 *vmcs12)
9929{
9930 unsigned long exit_qualification;
9931 gpa_t bitmap, last_bitmap;
9932 unsigned int port;
9933 int size;
9934 u8 b;
9935
9936 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
9937 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
9938
9939 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
9940
9941 port = exit_qualification >> 16;
9942 size = (exit_qualification & 7) + 1;
9943
9944 last_bitmap = (gpa_t)-1;
9945 b = -1;
9946
9947 while (size > 0) {
9948 if (port < 0x8000)
9949 bitmap = vmcs12->io_bitmap_a;
9950 else if (port < 0x10000)
9951 bitmap = vmcs12->io_bitmap_b;
9952 else
9953 return true;
9954 bitmap += (port & 0x7fff) / 8;
9955
9956 if (last_bitmap != bitmap)
9957 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
9958 return true;
9959 if (b & (1 << (port & 7)))
9960 return true;
9961
9962 port++;
9963 size--;
9964 last_bitmap = bitmap;
9965 }
9966
9967 return false;
9968}
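
/*
 * Illustrative sketch, not part of vmx.c: how a single 16-bit I/O port maps
 * into the two 4 KiB I/O bitmaps consulted above. Ports 0x0000-0x7fff live
 * in bitmap A and ports 0x8000-0xffff in bitmap B; within the selected
 * bitmap each port is one bit. The helper name and the caller-supplied
 * bitmap buffers are assumptions made for this example only.
 */
static int io_port_bit_set_sketch(const unsigned char *bitmap_a,
				  const unsigned char *bitmap_b,
				  unsigned short port)
{
	/* Select bitmap A for ports below 0x8000, bitmap B otherwise. */
	const unsigned char *bitmap = (port < 0x8000) ? bitmap_a : bitmap_b;

	/* The low 15 bits index one bit within the selected 4 KiB bitmap. */
	unsigned int idx = port & 0x7fff;

	return (bitmap[idx / 8] >> (idx & 7)) & 1;
}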
9969
9970/*
 9971	 * Return 1 if we should exit from L2 to L1 to handle an MSR access,
9972 * rather than handle it ourselves in L0. I.e., check whether L1 expressed
9973 * disinterest in the current event (read or write a specific MSR) by using an
9974 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
9975 */
9976static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
9977 struct vmcs12 *vmcs12, u32 exit_reason)
9978{
9979 u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
9980 gpa_t bitmap;
9981
9982 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
9983 return true;
9984
9985 /*
9986 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
9987 * for the four combinations of read/write and low/high MSR numbers.
9988 * First we need to figure out which of the four to use:
9989 */
9990 bitmap = vmcs12->msr_bitmap;
9991 if (exit_reason == EXIT_REASON_MSR_WRITE)
9992 bitmap += 2048;
9993 if (msr_index >= 0xc0000000) {
9994 msr_index -= 0xc0000000;
9995 bitmap += 1024;
9996 }
9997
9998 /* Then read the msr_index'th bit from this bitmap: */
9999 if (msr_index < 1024*8) {
10000 unsigned char b;
10001 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
10002 return true;
10003 return 1 & (b >> (msr_index & 7));
10004 } else
10005 return true; /* let L1 handle the wrong parameter */
10006}
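
/*
 * Illustrative sketch, not part of vmx.c: locating an MSR's intercept bit
 * inside the 4 KiB MSR bitmap laid out as described above -- read bitmaps
 * at offset 0, write bitmaps at offset 2048, with the high MSR range
 * (0xc0000000-0xc0001fff) a further 1024 bytes in. A return of -1 matches
 * the "let L1 handle the wrong parameter" case above. The helper name is an
 * assumption made for this example only.
 */
static int msr_bitmap_offset_sketch(unsigned int msr, int is_write,
				    unsigned int *byte, unsigned int *bit)
{
	unsigned int base = is_write ? 2048 : 0;

	if (msr >= 0xc0000000 && msr <= 0xc0001fff) {
		msr -= 0xc0000000;
		base += 1024;
	} else if (msr > 0x1fff) {
		return -1;	/* outside both ranges */
	}

	*byte = base + msr / 8;
	*bit = msr & 7;
	return 0;
}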
10007
10008/*
10009 * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
10010 * rather than handle it ourselves in L0. I.e., check if L1 wanted to
10011 * intercept (via guest_host_mask etc.) the current event.
10012 */
10013static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
10014 struct vmcs12 *vmcs12)
10015{
10016 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
10017 int cr = exit_qualification & 15;
10018 int reg;
10019 unsigned long val;
10020
10021 switch ((exit_qualification >> 4) & 3) {
10022 case 0: /* mov to cr */
10023 reg = (exit_qualification >> 8) & 15;
10024 val = kvm_register_readl(vcpu, reg);
10025 switch (cr) {
10026 case 0:
10027 if (vmcs12->cr0_guest_host_mask &
10028 (val ^ vmcs12->cr0_read_shadow))
10029 return true;
10030 break;
10031 case 3:
10032 if ((vmcs12->cr3_target_count >= 1 &&
10033 vmcs12->cr3_target_value0 == val) ||
10034 (vmcs12->cr3_target_count >= 2 &&
10035 vmcs12->cr3_target_value1 == val) ||
10036 (vmcs12->cr3_target_count >= 3 &&
10037 vmcs12->cr3_target_value2 == val) ||
10038 (vmcs12->cr3_target_count >= 4 &&
10039 vmcs12->cr3_target_value3 == val))
10040 return false;
10041 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
10042 return true;
10043 break;
10044 case 4:
10045 if (vmcs12->cr4_guest_host_mask &
10046 (vmcs12->cr4_read_shadow ^ val))
10047 return true;
10048 break;
10049 case 8:
10050 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
10051 return true;
10052 break;
10053 }
10054 break;
10055 case 2: /* clts */
10056 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
10057 (vmcs12->cr0_read_shadow & X86_CR0_TS))
10058 return true;
10059 break;
10060 case 1: /* mov from cr */
10061 switch (cr) {
10062 case 3:
10063 if (vmcs12->cpu_based_vm_exec_control &
10064 CPU_BASED_CR3_STORE_EXITING)
10065 return true;
10066 break;
10067 case 8:
10068 if (vmcs12->cpu_based_vm_exec_control &
10069 CPU_BASED_CR8_STORE_EXITING)
10070 return true;
10071 break;
10072 }
10073 break;
10074 case 3: /* lmsw */
10075 /*
10076 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
10077 * cr0. Other attempted changes are ignored, with no exit.
10078 */
10079 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
10080 if (vmcs12->cr0_guest_host_mask & 0xe &
10081 (val ^ vmcs12->cr0_read_shadow))
10082 return true;
10083 if ((vmcs12->cr0_guest_host_mask & 0x1) &&
10084 !(vmcs12->cr0_read_shadow & 0x1) &&
10085 (val & 0x1))
10086 return true;
10087 break;
10088 }
10089 return false;
10090}
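
/*
 * Illustrative sketch, not part of vmx.c: the guest/host mask test used for
 * CR0 and CR4 above. L1 "owns" the bits set in the guest/host mask, and a
 * MOV to the register reflects to L1 only if the new value differs from
 * L1's read shadow in at least one owned bit. The helper name is an
 * assumption made for this example only.
 */
static int l1_wants_cr_exit_sketch(unsigned long guest_host_mask,
				   unsigned long read_shadow,
				   unsigned long new_val)
{
	/* XOR exposes the bits that would change; the mask keeps L1's bits. */
	return (guest_host_mask & (new_val ^ read_shadow)) != 0;
}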
10091
10092static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
10093 struct vmcs12 *vmcs12, gpa_t bitmap)
10094{
10095 u32 vmx_instruction_info;
10096 unsigned long field;
10097 u8 b;
10098
10099 if (!nested_cpu_has_shadow_vmcs(vmcs12))
10100 return true;
10101
10102 /* Decode instruction info and find the field to access */
10103 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
10104 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
10105
10106 /* Out-of-range fields always cause a VM exit from L2 to L1 */
10107 if (field >> 15)
10108 return true;
10109
10110 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
10111 return true;
10112
10113 return 1 & (b >> (field & 7));
10114}
10115
10116/*
10117 * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
10118 * should handle it ourselves in L0 (and then continue L2). Only call this
10119 * when in is_guest_mode (L2).
10120 */
10121static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
10122{
10123 u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
10124 struct vcpu_vmx *vmx = to_vmx(vcpu);
10125 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
10126
10127 if (vmx->nested.nested_run_pending)
10128 return false;
10129
10130 if (unlikely(vmx->fail)) {
10131 pr_info_ratelimited("%s failed vm entry %x\n", __func__,
10132 vmcs_read32(VM_INSTRUCTION_ERROR));
10133 return true;
10134 }
10135
10136 /*
10137 * The host physical addresses of some pages of guest memory
10138 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
10139 * Page). The CPU may write to these pages via their host
10140 * physical address while L2 is running, bypassing any
10141 * address-translation-based dirty tracking (e.g. EPT write
10142 * protection).
10143 *
10144 * Mark them dirty on every exit from L2 to prevent them from
10145 * getting out of sync with dirty tracking.
10146 */
10147 nested_mark_vmcs12_pages_dirty(vcpu);
10148
10149 trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
10150 vmcs_readl(EXIT_QUALIFICATION),
10151 vmx->idt_vectoring_info,
10152 intr_info,
10153 vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
10154 KVM_ISA_VMX);
10155
10156 switch (exit_reason) {
10157 case EXIT_REASON_EXCEPTION_NMI:
10158 if (is_nmi(intr_info))
10159 return false;
10160 else if (is_page_fault(intr_info))
10161 return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
10162 else if (is_debug(intr_info) &&
10163 vcpu->guest_debug &
10164 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
10165 return false;
10166 else if (is_breakpoint(intr_info) &&
10167 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
10168 return false;
10169 return vmcs12->exception_bitmap &
10170 (1u << (intr_info & INTR_INFO_VECTOR_MASK));
10171 case EXIT_REASON_EXTERNAL_INTERRUPT:
10172 return false;
10173 case EXIT_REASON_TRIPLE_FAULT:
10174 return true;
10175 case EXIT_REASON_PENDING_INTERRUPT:
10176 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
10177 case EXIT_REASON_NMI_WINDOW:
10178 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
10179 case EXIT_REASON_TASK_SWITCH:
10180 return true;
10181 case EXIT_REASON_CPUID:
10182 return true;
10183 case EXIT_REASON_HLT:
10184 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
10185 case EXIT_REASON_INVD:
10186 return true;
10187 case EXIT_REASON_INVLPG:
10188 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
10189 case EXIT_REASON_RDPMC:
10190 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
10191 case EXIT_REASON_RDRAND:
10192 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
10193 case EXIT_REASON_RDSEED:
10194 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
10195 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
10196 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
10197 case EXIT_REASON_VMREAD:
10198 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
10199 vmcs12->vmread_bitmap);
10200 case EXIT_REASON_VMWRITE:
10201 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
10202 vmcs12->vmwrite_bitmap);
10203 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
10204 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
10205 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
10206 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
10207 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
10208 /*
10209 * VMX instructions trap unconditionally. This allows L1 to
10210 * emulate them for its L2 guest, i.e., allows 3-level nesting!
10211 */
10212 return true;
10213 case EXIT_REASON_CR_ACCESS:
10214 return nested_vmx_exit_handled_cr(vcpu, vmcs12);
10215 case EXIT_REASON_DR_ACCESS:
10216 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
10217 case EXIT_REASON_IO_INSTRUCTION:
10218 return nested_vmx_exit_handled_io(vcpu, vmcs12);
10219 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
10220 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
10221 case EXIT_REASON_MSR_READ:
10222 case EXIT_REASON_MSR_WRITE:
10223 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
10224 case EXIT_REASON_INVALID_STATE:
10225 return true;
10226 case EXIT_REASON_MWAIT_INSTRUCTION:
10227 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
10228 case EXIT_REASON_MONITOR_TRAP_FLAG:
10229 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
10230 case EXIT_REASON_MONITOR_INSTRUCTION:
10231 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
10232 case EXIT_REASON_PAUSE_INSTRUCTION:
10233 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
10234 nested_cpu_has2(vmcs12,
10235 SECONDARY_EXEC_PAUSE_LOOP_EXITING);
10236 case EXIT_REASON_MCE_DURING_VMENTRY:
10237 return false;
10238 case EXIT_REASON_TPR_BELOW_THRESHOLD:
10239 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
10240 case EXIT_REASON_APIC_ACCESS:
10241 case EXIT_REASON_APIC_WRITE:
10242 case EXIT_REASON_EOI_INDUCED:
10243 /*
10244 * The controls for "virtualize APIC accesses," "APIC-
10245 * register virtualization," and "virtual-interrupt
10246 * delivery" only come from vmcs12.
10247 */
10248 return true;
10249 case EXIT_REASON_EPT_VIOLATION:
10250 /*
10251 * L0 always deals with the EPT violation. If nested EPT is
10252 * used, and the nested mmu code discovers that the address is
10253 * missing in the guest EPT table (EPT12), the EPT violation
10254 * will be injected with nested_ept_inject_page_fault()
10255 */
10256 return false;
10257 case EXIT_REASON_EPT_MISCONFIG:
10258 /*
10259		 * L2 never directly uses L1's EPT, but rather L0's own EPT
10260		 * table (shadow on EPT) or a merged EPT table that L0 built
10261		 * (EPT on EPT). So any problems with the structure of the
10262		 * table are L0's fault.
10263 */
10264 return false;
10265 case EXIT_REASON_INVPCID:
10266 return
10267 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
10268 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
10269 case EXIT_REASON_WBINVD:
10270 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
10271 case EXIT_REASON_XSETBV:
10272 return true;
10273 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
10274 /*
10275 * This should never happen, since it is not possible to
10276 * set XSS to a non-zero value---neither in L1 nor in L2.
10277		 * If it were, XSS would have to be checked against
10278 * the XSS exit bitmap in vmcs12.
10279 */
10280 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
10281 case EXIT_REASON_PREEMPTION_TIMER:
10282 return false;
10283 case EXIT_REASON_PML_FULL:
10284 /* We emulate PML support to L1. */
10285 return false;
10286 case EXIT_REASON_VMFUNC:
10287 /* VM functions are emulated through L2->L0 vmexits. */
10288 return false;
10289 case EXIT_REASON_ENCLS:
10290 /* SGX is never exposed to L1 */
10291 return false;
10292 default:
10293 return true;
10294 }
10295}
10296
10297static int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason)
10298{
10299 u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
10300
10301 /*
10302 * At this point, the exit interruption info in exit_intr_info
10303 * is only valid for EXCEPTION_NMI exits. For EXTERNAL_INTERRUPT
10304 * we need to query the in-kernel LAPIC.
10305 */
10306 WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT);
10307 if ((exit_intr_info &
10308 (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
10309 (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
10310 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
10311 vmcs12->vm_exit_intr_error_code =
10312 vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
10313 }
10314
10315 nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info,
10316 vmcs_readl(EXIT_QUALIFICATION));
10317 return 1;
10318}
10319
10320static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
10321{
10322 *info1 = vmcs_readl(EXIT_QUALIFICATION);
10323 *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
10324}
10325
10326static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
10327{
10328 if (vmx->pml_pg) {
10329 __free_page(vmx->pml_pg);
10330 vmx->pml_pg = NULL;
10331 }
10332}
10333
10334static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
10335{
10336 struct vcpu_vmx *vmx = to_vmx(vcpu);
10337 u64 *pml_buf;
10338 u16 pml_idx;
10339
10340 pml_idx = vmcs_read16(GUEST_PML_INDEX);
10341
10342 /* Do nothing if PML buffer is empty */
10343 if (pml_idx == (PML_ENTITY_NUM - 1))
10344 return;
10345
10346 /* PML index always points to next available PML buffer entity */
10347 if (pml_idx >= PML_ENTITY_NUM)
10348 pml_idx = 0;
10349 else
10350 pml_idx++;
10351
10352 pml_buf = page_address(vmx->pml_pg);
10353 for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
10354 u64 gpa;
10355
10356 gpa = pml_buf[pml_idx];
10357 WARN_ON(gpa & (PAGE_SIZE - 1));
10358 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
10359 }
10360
10361 /* reset PML index */
10362 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
10363}
10364
10365/*
10366 * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
10367 * Called before reporting dirty_bitmap to userspace.
10368 */
10369static void kvm_flush_pml_buffers(struct kvm *kvm)
10370{
10371 int i;
10372 struct kvm_vcpu *vcpu;
10373 /*
10374	 * We only need to kick vcpus out of guest mode here, as the PML
10375	 * buffer is flushed at the beginning of every VMEXIT, so only
10376	 * vcpus currently running in guest mode can have unflushed GPAs
10377	 * in their PML buffer.
10378 */
10379 kvm_for_each_vcpu(i, vcpu, kvm)
10380 kvm_vcpu_kick(vcpu);
10381}
10382
10383static void vmx_dump_sel(char *name, uint32_t sel)
10384{
10385 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
10386 name, vmcs_read16(sel),
10387 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
10388 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
10389 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
10390}
10391
10392static void vmx_dump_dtsel(char *name, uint32_t limit)
10393{
10394 pr_err("%s limit=0x%08x, base=0x%016lx\n",
10395 name, vmcs_read32(limit),
10396 vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
10397}
10398
10399static void dump_vmcs(void)
10400{
10401 u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
10402 u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
10403 u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
10404 u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
10405 u32 secondary_exec_control = 0;
10406 unsigned long cr4 = vmcs_readl(GUEST_CR4);
10407 u64 efer = vmcs_read64(GUEST_IA32_EFER);
10408 int i, n;
10409
10410 if (cpu_has_secondary_exec_ctrls())
10411 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
10412
10413 pr_err("*** Guest State ***\n");
10414 pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
10415 vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
10416 vmcs_readl(CR0_GUEST_HOST_MASK));
10417 pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
10418 cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
10419 pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
10420 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
10421 (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
10422 {
10423 pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
10424 vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
10425 pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
10426 vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
10427 }
10428 pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
10429 vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
10430 pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
10431 vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
10432 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
10433 vmcs_readl(GUEST_SYSENTER_ESP),
10434 vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
10435 vmx_dump_sel("CS: ", GUEST_CS_SELECTOR);
10436 vmx_dump_sel("DS: ", GUEST_DS_SELECTOR);
10437 vmx_dump_sel("SS: ", GUEST_SS_SELECTOR);
10438 vmx_dump_sel("ES: ", GUEST_ES_SELECTOR);
10439 vmx_dump_sel("FS: ", GUEST_FS_SELECTOR);
10440 vmx_dump_sel("GS: ", GUEST_GS_SELECTOR);
10441 vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
10442 vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
10443 vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
10444 vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
10445 if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
10446 (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
10447 pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
10448 efer, vmcs_read64(GUEST_IA32_PAT));
10449 pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
10450 vmcs_read64(GUEST_IA32_DEBUGCTL),
10451 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
10452 if (cpu_has_load_perf_global_ctrl &&
10453 vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
10454 pr_err("PerfGlobCtl = 0x%016llx\n",
10455 vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
10456 if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
10457 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
10458 pr_err("Interruptibility = %08x ActivityState = %08x\n",
10459 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
10460 vmcs_read32(GUEST_ACTIVITY_STATE));
10461 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
10462 pr_err("InterruptStatus = %04x\n",
10463 vmcs_read16(GUEST_INTR_STATUS));
10464
10465 pr_err("*** Host State ***\n");
10466 pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
10467 vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
10468 pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
10469 vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
10470 vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
10471 vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
10472 vmcs_read16(HOST_TR_SELECTOR));
10473 pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
10474 vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
10475 vmcs_readl(HOST_TR_BASE));
10476 pr_err("GDTBase=%016lx IDTBase=%016lx\n",
10477 vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
10478 pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
10479 vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
10480 vmcs_readl(HOST_CR4));
10481 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
10482 vmcs_readl(HOST_IA32_SYSENTER_ESP),
10483 vmcs_read32(HOST_IA32_SYSENTER_CS),
10484 vmcs_readl(HOST_IA32_SYSENTER_EIP));
10485 if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
10486 pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
10487 vmcs_read64(HOST_IA32_EFER),
10488 vmcs_read64(HOST_IA32_PAT));
10489 if (cpu_has_load_perf_global_ctrl &&
10490 vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
10491 pr_err("PerfGlobCtl = 0x%016llx\n",
10492 vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
10493
10494 pr_err("*** Control State ***\n");
10495 pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
10496 pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
10497 pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
10498 pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
10499 vmcs_read32(EXCEPTION_BITMAP),
10500 vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
10501 vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
10502 pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
10503 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
10504 vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
10505 vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
10506 pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
10507 vmcs_read32(VM_EXIT_INTR_INFO),
10508 vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
10509 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
10510 pr_err(" reason=%08x qualification=%016lx\n",
10511 vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
10512 pr_err("IDTVectoring: info=%08x errcode=%08x\n",
10513 vmcs_read32(IDT_VECTORING_INFO_FIELD),
10514 vmcs_read32(IDT_VECTORING_ERROR_CODE));
10515 pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
10516 if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
10517 pr_err("TSC Multiplier = 0x%016llx\n",
10518 vmcs_read64(TSC_MULTIPLIER));
10519 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW)
10520 pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
10521 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
10522 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
10523 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
10524 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
10525 n = vmcs_read32(CR3_TARGET_COUNT);
10526 for (i = 0; i + 1 < n; i += 4)
10527 pr_err("CR3 target%u=%016lx target%u=%016lx\n",
10528 i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2),
10529 i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2));
10530 if (i < n)
10531 pr_err("CR3 target%u=%016lx\n",
10532 i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2));
10533 if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
10534 pr_err("PLE Gap=%08x Window=%08x\n",
10535 vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
10536 if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
10537 pr_err("Virtual processor ID = 0x%04x\n",
10538 vmcs_read16(VIRTUAL_PROCESSOR_ID));
10539}
10540
10541/*
10542 * The guest has exited. See if we can fix it or if we need userspace
10543 * assistance.
10544 */
10545static int vmx_handle_exit(struct kvm_vcpu *vcpu)
10546{
10547 struct vcpu_vmx *vmx = to_vmx(vcpu);
10548 u32 exit_reason = vmx->exit_reason;
10549 u32 vectoring_info = vmx->idt_vectoring_info;
10550
10551 trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
10552
10553 /*
10554	 * Flush the PML buffer of logged GPAs; this keeps dirty_bitmap up
10555	 * to date. Another benefit is that in kvm_vm_ioctl_get_dirty_log,
10556	 * before querying dirty_bitmap, we only need to kick all vcpus out
10557	 * of guest mode, because if a vcpu is in root mode, its PML buffer
10558	 * must have been flushed already.
10559 */
10560 if (enable_pml)
10561 vmx_flush_pml_buffer(vcpu);
10562
10563 /* If guest state is invalid, start emulating */
10564 if (vmx->emulation_required)
10565 return handle_invalid_guest_state(vcpu);
10566
10567 if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
10568 return nested_vmx_reflect_vmexit(vcpu, exit_reason);
10569
10570 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
10571 dump_vmcs();
10572 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
10573 vcpu->run->fail_entry.hardware_entry_failure_reason
10574 = exit_reason;
10575 return 0;
10576 }
10577
10578 if (unlikely(vmx->fail)) {
10579 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
10580 vcpu->run->fail_entry.hardware_entry_failure_reason
10581 = vmcs_read32(VM_INSTRUCTION_ERROR);
10582 return 0;
10583 }
10584
10585 /*
10586 * Note:
10587	 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by
10588	 * an event delivery, since that indicates the guest is accessing
10589	 * MMIO. The vm-exit would be triggered again after returning to
10590	 * the guest, causing an infinite loop.
10591 */
10592 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
10593 (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
10594 exit_reason != EXIT_REASON_EPT_VIOLATION &&
10595 exit_reason != EXIT_REASON_PML_FULL &&
10596 exit_reason != EXIT_REASON_TASK_SWITCH)) {
10597 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
10598 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
10599 vcpu->run->internal.ndata = 3;
10600 vcpu->run->internal.data[0] = vectoring_info;
10601 vcpu->run->internal.data[1] = exit_reason;
10602 vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
10603 if (exit_reason == EXIT_REASON_EPT_MISCONFIG) {
10604 vcpu->run->internal.ndata++;
10605 vcpu->run->internal.data[3] =
10606 vmcs_read64(GUEST_PHYSICAL_ADDRESS);
10607 }
10608 return 0;
10609 }
10610
10611 if (unlikely(!enable_vnmi &&
10612 vmx->loaded_vmcs->soft_vnmi_blocked)) {
10613 if (vmx_interrupt_allowed(vcpu)) {
10614 vmx->loaded_vmcs->soft_vnmi_blocked = 0;
10615 } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
10616 vcpu->arch.nmi_pending) {
10617 /*
10618			 * This CPU doesn't help us find the end of an
10619 * NMI-blocked window if the guest runs with IRQs
10620 * disabled. So we pull the trigger after 1 s of
10621 * futile waiting, but inform the user about this.
10622 */
10623 printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
10624 "state on VCPU %d after 1 s timeout\n",
10625 __func__, vcpu->vcpu_id);
10626 vmx->loaded_vmcs->soft_vnmi_blocked = 0;
10627 }
10628 }
10629
10630 if (exit_reason < kvm_vmx_max_exit_handlers
10631 && kvm_vmx_exit_handlers[exit_reason])
10632 return kvm_vmx_exit_handlers[exit_reason](vcpu);
10633 else {
10634 vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
10635 exit_reason);
10636 kvm_queue_exception(vcpu, UD_VECTOR);
10637 return 1;
10638 }
10639}
10640
10641/*
10642 * Software based L1D cache flush which is used when microcode providing
10643 * the cache control MSR is not loaded.
10644 *
10645	 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but
10646	 * flushing it requires reading in 64 KiB because the replacement
10647	 * algorithm is not exactly LRU. This could be sized at runtime via
10648	 * topology information, but as all relevant affected CPUs have a 32 KiB
10649	 * L1D cache there is no point in doing so.
10650 */
10651static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
10652{
10653 int size = PAGE_SIZE << L1D_CACHE_ORDER;
10654
10655 /*
10656		 * This code is only executed when the flush mode is 'cond' or
10657		 * 'always'.
10658 */
10659 if (static_branch_likely(&vmx_l1d_flush_cond)) {
10660 bool flush_l1d;
10661
10662 /*
10663 * Clear the per-vcpu flush bit, it gets set again
10664 * either from vcpu_run() or from one of the unsafe
10665 * VMEXIT handlers.
10666 */
10667 flush_l1d = vcpu->arch.l1tf_flush_l1d;
10668 vcpu->arch.l1tf_flush_l1d = false;
10669
10670 /*
10671 * Clear the per-cpu flush bit, it gets set again from
10672 * the interrupt handlers.
10673 */
10674 flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
10675 kvm_clear_cpu_l1tf_flush_l1d();
10676
10677 if (!flush_l1d)
10678 return;
10679 }
10680
10681 vcpu->stat.l1d_flush++;
10682
10683 if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
10684 wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
10685 return;
10686 }
10687
10688 asm volatile(
10689 /* First ensure the pages are in the TLB */
10690 "xorl %%eax, %%eax\n"
10691 ".Lpopulate_tlb:\n\t"
10692 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
10693 "addl $4096, %%eax\n\t"
10694 "cmpl %%eax, %[size]\n\t"
10695 "jne .Lpopulate_tlb\n\t"
10696 "xorl %%eax, %%eax\n\t"
10697 "cpuid\n\t"
10698 /* Now fill the cache */
10699 "xorl %%eax, %%eax\n"
10700 ".Lfill_cache:\n"
10701 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
10702 "addl $64, %%eax\n\t"
10703 "cmpl %%eax, %[size]\n\t"
10704 "jne .Lfill_cache\n\t"
10705 "lfence\n"
10706 :: [flush_pages] "r" (vmx_l1d_flush_pages),
10707 [size] "r" (size)
10708 : "eax", "ebx", "ecx", "edx");
10709}
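
/*
 * Illustrative sketch, not part of vmx.c: a plain-C rendering of the access
 * pattern implemented by the inline asm above, assuming a caller-supplied
 * 64 KiB buffer. It first touches one byte per 4 KiB page so the pages are
 * in the TLB, then reads one byte per 64-byte cache line so the buffer
 * displaces the L1D contents. The real code stays in asm (with a
 * serializing CPUID and a trailing LFENCE) so the compiler cannot reorder
 * or elide the reads; this sketch only conveys the pattern and is not a
 * safe replacement.
 */
static void l1d_flush_sketch(const volatile unsigned char *flush_pages,
			     unsigned long size)
{
	unsigned long offset;
	unsigned char sink = 0;

	/* First ensure the pages are in the TLB: one access per 4 KiB page. */
	for (offset = 0; offset < size; offset += 4096)
		sink ^= flush_pages[offset];

	/* Now fill the cache: one access per 64-byte cache line. */
	for (offset = 0; offset < size; offset += 64)
		sink ^= flush_pages[offset];

	(void)sink;
}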
10710
10711static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
10712{
10713 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
10714
10715 if (is_guest_mode(vcpu) &&
10716 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
10717 return;
10718
10719 if (irr == -1 || tpr < irr) {
10720 vmcs_write32(TPR_THRESHOLD, 0);
10721 return;
10722 }
10723
10724 vmcs_write32(TPR_THRESHOLD, irr);
10725}
10726
10727static void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
10728{
10729 u32 sec_exec_control;
10730
10731 if (!lapic_in_kernel(vcpu))
10732 return;
10733
10734 if (!flexpriority_enabled &&
10735 !cpu_has_vmx_virtualize_x2apic_mode())
10736 return;
10737
10738 /* Postpone execution until vmcs01 is the current VMCS. */
10739 if (is_guest_mode(vcpu)) {
10740 to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true;
10741 return;
10742 }
10743
10744 sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
10745 sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
10746 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
10747
10748 switch (kvm_get_apic_mode(vcpu)) {
10749 case LAPIC_MODE_INVALID:
10750 WARN_ONCE(true, "Invalid local APIC state");
10751 case LAPIC_MODE_DISABLED:
10752 break;
10753 case LAPIC_MODE_XAPIC:
10754 if (flexpriority_enabled) {
10755 sec_exec_control |=
10756 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
10757 vmx_flush_tlb(vcpu, true);
10758 }
10759 break;
10760 case LAPIC_MODE_X2APIC:
10761 if (cpu_has_vmx_virtualize_x2apic_mode())
10762 sec_exec_control |=
10763 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
10764 break;
10765 }
10766 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
10767
10768 vmx_update_msr_bitmap(vcpu);
10769}
10770
10771static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
10772{
10773 if (!is_guest_mode(vcpu)) {
10774 vmcs_write64(APIC_ACCESS_ADDR, hpa);
10775 vmx_flush_tlb(vcpu, true);
10776 }
10777}
10778
10779static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
10780{
10781 u16 status;
10782 u8 old;
10783
10784 if (max_isr == -1)
10785 max_isr = 0;
10786
10787 status = vmcs_read16(GUEST_INTR_STATUS);
10788 old = status >> 8;
10789 if (max_isr != old) {
10790 status &= 0xff;
10791 status |= max_isr << 8;
10792 vmcs_write16(GUEST_INTR_STATUS, status);
10793 }
10794}
10795
10796static void vmx_set_rvi(int vector)
10797{
10798 u16 status;
10799 u8 old;
10800
10801 if (vector == -1)
10802 vector = 0;
10803
10804 status = vmcs_read16(GUEST_INTR_STATUS);
10805 old = (u8)status & 0xff;
10806 if ((u8)vector != old) {
10807 status &= ~0xff;
10808 status |= (u8)vector;
10809 vmcs_write16(GUEST_INTR_STATUS, status);
10810 }
10811}
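
/*
 * Illustrative sketch, not part of vmx.c: the 16-bit guest interrupt status
 * manipulated by the two helpers above packs RVI (requesting virtual
 * interrupt) in bits 7:0 and SVI (servicing virtual interrupt) in bits 15:8;
 * vmx_set_rvi() updates the low byte and vmx_hwapic_isr_update() the high
 * byte. The helper name is an assumption made for this example only.
 */
static unsigned short pack_guest_intr_status_sketch(unsigned char rvi,
						    unsigned char svi)
{
	return (unsigned short)(((unsigned short)svi << 8) | rvi);
}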
10812
10813static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
10814{
10815 /*
10816		 * When running L2, updating RVI is only relevant when
10817		 * vmcs12 virtual-interrupt-delivery is enabled.
10818		 * However, that can be enabled only when L1 also
10819		 * intercepts external interrupts, and in that case
10820		 * we should not update the vmcs02 RVI but instead intercept
10821		 * the interrupt. Therefore, do nothing when running L2.
10822 */
10823 if (!is_guest_mode(vcpu))
10824 vmx_set_rvi(max_irr);
10825}
10826
10827static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
10828{
10829 struct vcpu_vmx *vmx = to_vmx(vcpu);
10830 int max_irr;
10831 bool max_irr_updated;
10832
10833 WARN_ON(!vcpu->arch.apicv_active);
10834 if (pi_test_on(&vmx->pi_desc)) {
10835 pi_clear_on(&vmx->pi_desc);
10836 /*
10837 * IOMMU can write to PIR.ON, so the barrier matters even on UP.
10838 * But on x86 this is just a compiler barrier anyway.
10839 */
10840 smp_mb__after_atomic();
10841 max_irr_updated =
10842 kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
10843
10844 /*
10845 * If we are running L2 and L1 has a new pending interrupt
10846 * which can be injected, we should re-evaluate
10847 * what should be done with this new L1 interrupt.
10848		 * If L1 intercepts external interrupts, we should
10849		 * exit from L2 to L1. Otherwise, the interrupt should be
10850 * delivered directly to L2.
10851 */
10852 if (is_guest_mode(vcpu) && max_irr_updated) {
10853 if (nested_exit_on_intr(vcpu))
10854 kvm_vcpu_exiting_guest_mode(vcpu);
10855 else
10856 kvm_make_request(KVM_REQ_EVENT, vcpu);
10857 }
10858 } else {
10859 max_irr = kvm_lapic_find_highest_irr(vcpu);
10860 }
10861 vmx_hwapic_irr_update(vcpu, max_irr);
10862 return max_irr;
10863}
10864
10865static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
10866{
10867 u8 rvi = vmx_get_rvi();
10868 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
10869
10870 return ((rvi & 0xf0) > (vppr & 0xf0));
10871}
10872
10873static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
10874{
10875 if (!kvm_vcpu_apicv_active(vcpu))
10876 return;
10877
10878 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
10879 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
10880 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
10881 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
10882}
10883
10884static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
10885{
10886 struct vcpu_vmx *vmx = to_vmx(vcpu);
10887
10888 pi_clear_on(&vmx->pi_desc);
10889 memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
10890}
10891
10892static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
10893{
10894 u32 exit_intr_info = 0;
10895 u16 basic_exit_reason = (u16)vmx->exit_reason;
10896
10897 if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
10898 || basic_exit_reason == EXIT_REASON_EXCEPTION_NMI))
10899 return;
10900
10901 if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
10902 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
10903 vmx->exit_intr_info = exit_intr_info;
10904
10905 /* if exit due to PF check for async PF */
10906 if (is_page_fault(exit_intr_info))
10907 vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
10908
10909 /* Handle machine checks before interrupts are enabled */
10910 if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY ||
10911 is_machine_check(exit_intr_info))
10912 kvm_machine_check();
10913
10914 /* We need to handle NMIs before interrupts are enabled */
10915 if (is_nmi(exit_intr_info)) {
10916 kvm_before_interrupt(&vmx->vcpu);
10917 asm("int $2");
10918 kvm_after_interrupt(&vmx->vcpu);
10919 }
10920}
10921
10922static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
10923{
10924 u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
10925
10926 if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
10927 == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
10928 unsigned int vector;
10929 unsigned long entry;
10930 gate_desc *desc;
10931 struct vcpu_vmx *vmx = to_vmx(vcpu);
10932#ifdef CONFIG_X86_64
10933 unsigned long tmp;
10934#endif
10935
10936 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
10937 desc = (gate_desc *)vmx->host_idt_base + vector;
10938 entry = gate_offset(desc);
10939 asm volatile(
10940#ifdef CONFIG_X86_64
10941 "mov %%" _ASM_SP ", %[sp]\n\t"
10942 "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
10943 "push $%c[ss]\n\t"
10944 "push %[sp]\n\t"
10945#endif
10946 "pushf\n\t"
10947 __ASM_SIZE(push) " $%c[cs]\n\t"
10948 CALL_NOSPEC
10949 :
10950#ifdef CONFIG_X86_64
10951 [sp]"=&r"(tmp),
10952#endif
10953 ASM_CALL_CONSTRAINT
10954 :
10955 THUNK_TARGET(entry),
10956 [ss]"i"(__KERNEL_DS),
10957 [cs]"i"(__KERNEL_CS)
10958 );
10959 }
10960}
10961STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
10962
10963static bool vmx_has_emulated_msr(int index)
10964{
10965 switch (index) {
10966 case MSR_IA32_SMBASE:
10967 /*
10968 * We cannot do SMM unless we can run the guest in big
10969 * real mode.
10970 */
10971 return enable_unrestricted_guest || emulate_invalid_guest_state;
10972 case MSR_AMD64_VIRT_SPEC_CTRL:
10973 /* This is AMD only. */
10974 return false;
10975 default:
10976 return true;
10977 }
10978}
10979
10980static bool vmx_mpx_supported(void)
10981{
10982 return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
10983 (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
10984}
10985
10986static bool vmx_xsaves_supported(void)
10987{
10988 return vmcs_config.cpu_based_2nd_exec_ctrl &
10989 SECONDARY_EXEC_XSAVES;
10990}
10991
10992static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
10993{
10994 u32 exit_intr_info;
10995 bool unblock_nmi;
10996 u8 vector;
10997 bool idtv_info_valid;
10998
10999 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
11000
11001 if (enable_vnmi) {
11002 if (vmx->loaded_vmcs->nmi_known_unmasked)
11003 return;
11004 /*
11005 * Can't use vmx->exit_intr_info since we're not sure what
11006 * the exit reason is.
11007 */
11008 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
11009 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
11010 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
11011 /*
11012 * SDM 3: 27.7.1.2 (September 2008)
11013 * Re-set bit "block by NMI" before VM entry if vmexit caused by
11014 * a guest IRET fault.
11015 * SDM 3: 23.2.2 (September 2008)
11016 * Bit 12 is undefined in any of the following cases:
11017 * If the VM exit sets the valid bit in the IDT-vectoring
11018 * information field.
11019 * If the VM exit is due to a double fault.
11020 */
11021 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
11022 vector != DF_VECTOR && !idtv_info_valid)
11023 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
11024 GUEST_INTR_STATE_NMI);
11025 else
11026 vmx->loaded_vmcs->nmi_known_unmasked =
11027 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
11028 & GUEST_INTR_STATE_NMI);
11029 } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
11030 vmx->loaded_vmcs->vnmi_blocked_time +=
11031 ktime_to_ns(ktime_sub(ktime_get(),
11032 vmx->loaded_vmcs->entry_time));
11033}
11034
11035static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
11036 u32 idt_vectoring_info,
11037 int instr_len_field,
11038 int error_code_field)
11039{
11040 u8 vector;
11041 int type;
11042 bool idtv_info_valid;
11043
11044 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
11045
11046 vcpu->arch.nmi_injected = false;
11047 kvm_clear_exception_queue(vcpu);
11048 kvm_clear_interrupt_queue(vcpu);
11049
11050 if (!idtv_info_valid)
11051 return;
11052
11053 kvm_make_request(KVM_REQ_EVENT, vcpu);
11054
11055 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
11056 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
11057
11058 switch (type) {
11059 case INTR_TYPE_NMI_INTR:
11060 vcpu->arch.nmi_injected = true;
11061 /*
11062 * SDM 3: 27.7.1.2 (September 2008)
11063 * Clear bit "block by NMI" before VM entry if a NMI
11064 * delivery faulted.
11065 */
11066 vmx_set_nmi_mask(vcpu, false);
11067 break;
11068 case INTR_TYPE_SOFT_EXCEPTION:
11069 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
11070 /* fall through */
11071 case INTR_TYPE_HARD_EXCEPTION:
11072 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
11073 u32 err = vmcs_read32(error_code_field);
11074 kvm_requeue_exception_e(vcpu, vector, err);
11075 } else
11076 kvm_requeue_exception(vcpu, vector);
11077 break;
11078 case INTR_TYPE_SOFT_INTR:
11079 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
11080 /* fall through */
11081 case INTR_TYPE_EXT_INTR:
11082 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
11083 break;
11084 default:
11085 break;
11086 }
11087}
11088
11089static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
11090{
11091 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
11092 VM_EXIT_INSTRUCTION_LEN,
11093 IDT_VECTORING_ERROR_CODE);
11094}
11095
11096static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
11097{
11098 __vmx_complete_interrupts(vcpu,
11099 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
11100 VM_ENTRY_INSTRUCTION_LEN,
11101 VM_ENTRY_EXCEPTION_ERROR_CODE);
11102
11103 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
11104}
11105
11106static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
11107{
11108 int i, nr_msrs;
11109 struct perf_guest_switch_msr *msrs;
11110
11111 msrs = perf_guest_get_msrs(&nr_msrs);
11112
11113 if (!msrs)
11114 return;
11115
11116 for (i = 0; i < nr_msrs; i++)
11117 if (msrs[i].host == msrs[i].guest)
11118 clear_atomic_switch_msr(vmx, msrs[i].msr);
11119 else
11120 add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
11121 msrs[i].host, false);
11122}
11123
11124static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val)
11125{
11126 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val);
11127 if (!vmx->loaded_vmcs->hv_timer_armed)
11128 vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
11129 PIN_BASED_VMX_PREEMPTION_TIMER);
11130 vmx->loaded_vmcs->hv_timer_armed = true;
11131}
11132
11133static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
11134{
11135 struct vcpu_vmx *vmx = to_vmx(vcpu);
11136 u64 tscl;
11137 u32 delta_tsc;
11138
11139 if (vmx->req_immediate_exit) {
11140 vmx_arm_hv_timer(vmx, 0);
11141 return;
11142 }
11143
11144 if (vmx->hv_deadline_tsc != -1) {
11145 tscl = rdtsc();
11146 if (vmx->hv_deadline_tsc > tscl)
11147 /* set_hv_timer ensures the delta fits in 32-bits */
11148 delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
11149 cpu_preemption_timer_multi);
11150 else
11151 delta_tsc = 0;
11152
11153 vmx_arm_hv_timer(vmx, delta_tsc);
11154 return;
11155 }
11156
11157 if (vmx->loaded_vmcs->hv_timer_armed)
11158 vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
11159 PIN_BASED_VMX_PREEMPTION_TIMER);
11160 vmx->loaded_vmcs->hv_timer_armed = false;
11161}
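/*
 * A minimal sketch, assuming a preemption-timer rate shift of 5:
 * example_hv_timer_value() is a hypothetical helper that mirrors the
 * delta_tsc computation in vmx_update_hv_timer() above, where the remaining
 * TSC ticks are shifted right by the CPU-reported rate before being
 * programmed into the timer.
 *
 *   deadline - now = 64000 ticks, shift = 5  ->  programmed value 2000
 */
static inline u32 example_hv_timer_value(u64 deadline, u64 now, int rate_shift)
{
	/* Same shape as the hv_deadline_tsc handling above. */
	return deadline > now ? (u32)((deadline - now) >> rate_shift) : 0;
}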
11162
11163static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
11164{
11165 struct vcpu_vmx *vmx = to_vmx(vcpu);
11166 unsigned long cr3, cr4, evmcs_rsp;
11167
11168 /* Record the guest's net vcpu time for enforced NMI injections. */
11169 if (unlikely(!enable_vnmi &&
11170 vmx->loaded_vmcs->soft_vnmi_blocked))
11171 vmx->loaded_vmcs->entry_time = ktime_get();
11172
11173 /* Don't enter VMX if guest state is invalid; let the exit handler
11174 continue emulation until we arrive back at a valid state. */
11175 if (vmx->emulation_required)
11176 return;
11177
11178 if (vmx->ple_window_dirty) {
11179 vmx->ple_window_dirty = false;
11180 vmcs_write32(PLE_WINDOW, vmx->ple_window);
11181 }
11182
11183 if (vmx->nested.need_vmcs12_sync) {
11184 /*
11185 * hv_evmcs may end up being not mapped after migration (when
11186 * L2 was running), map it here to make sure vmcs12 changes are
11187 * properly reflected.
11188 */
11189 if (vmx->nested.enlightened_vmcs_enabled &&
11190 !vmx->nested.hv_evmcs)
11191 nested_vmx_handle_enlightened_vmptrld(vcpu, false);
11192
11193 if (vmx->nested.hv_evmcs) {
11194 copy_vmcs12_to_enlightened(vmx);
11195 /* All fields are clean */
11196 vmx->nested.hv_evmcs->hv_clean_fields |=
11197 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
11198 } else {
11199 copy_vmcs12_to_shadow(vmx);
11200 }
11201 vmx->nested.need_vmcs12_sync = false;
11202 }
11203
11204 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
11205 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
11206 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
11207 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
11208
11209 cr3 = __get_current_cr3_fast();
11210 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
11211 vmcs_writel(HOST_CR3, cr3);
11212 vmx->loaded_vmcs->host_state.cr3 = cr3;
11213 }
11214
11215 cr4 = cr4_read_shadow();
11216 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
11217 vmcs_writel(HOST_CR4, cr4);
11218 vmx->loaded_vmcs->host_state.cr4 = cr4;
11219 }
11220
11221 /* When single-stepping over STI and MOV SS, we must clear the
11222 * corresponding interruptibility bits in the guest state. Otherwise
11223 * vmentry fails as it then expects bit 14 (BS) in pending debug
11224 * exceptions being set, but that's not correct for the guest debugging
11225 * case. */
11226 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
11227 vmx_set_interrupt_shadow(vcpu, 0);
11228
11229 if (static_cpu_has(X86_FEATURE_PKU) &&
11230 kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
11231 vcpu->arch.pkru != vmx->host_pkru)
11232 __write_pkru(vcpu->arch.pkru);
11233
11234 atomic_switch_perf_msrs(vmx);
11235
11236 vmx_update_hv_timer(vcpu);
11237
11238 /*
11239 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
11240 * it's non-zero. Since vmentry is serialising on affected CPUs, there
11241 * is no need to worry about the conditional branch over the wrmsr
11242 * being speculatively taken.
11243 */
11244 x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
11245
11246 vmx->__launched = vmx->loaded_vmcs->launched;
11247
11248 evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
11249 (unsigned long)&current_evmcs->host_rsp : 0;
11250
11251 if (static_branch_unlikely(&vmx_l1d_should_flush))
11252 vmx_l1d_flush(vcpu);
11253
11254 asm(
11255 /* Store host registers */
11256 "push %%" _ASM_DX "; push %%" _ASM_BP ";"
11257 "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
11258 "push %%" _ASM_CX " \n\t"
11259 "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
11260 "je 1f \n\t"
11261 "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
11262 /* Avoid VMWRITE when Enlightened VMCS is in use */
11263 "test %%" _ASM_SI ", %%" _ASM_SI " \n\t"
11264 "jz 2f \n\t"
11265 "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t"
11266 "jmp 1f \n\t"
11267 "2: \n\t"
11268 __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
11269 "1: \n\t"
11270 /* Reload cr2 if changed */
11271 "mov %c[cr2](%0), %%" _ASM_AX " \n\t"
11272 "mov %%cr2, %%" _ASM_DX " \n\t"
11273 "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
11274 "je 3f \n\t"
11275 "mov %%" _ASM_AX", %%cr2 \n\t"
11276 "3: \n\t"
11277 /* Check if vmlaunch or vmresume is needed */
11278 "cmpl $0, %c[launched](%0) \n\t"
11279 /* Load guest registers. Don't clobber flags. */
11280 "mov %c[rax](%0), %%" _ASM_AX " \n\t"
11281 "mov %c[rbx](%0), %%" _ASM_BX " \n\t"
11282 "mov %c[rdx](%0), %%" _ASM_DX " \n\t"
11283 "mov %c[rsi](%0), %%" _ASM_SI " \n\t"
11284 "mov %c[rdi](%0), %%" _ASM_DI " \n\t"
11285 "mov %c[rbp](%0), %%" _ASM_BP " \n\t"
11286#ifdef CONFIG_X86_64
11287 "mov %c[r8](%0), %%r8 \n\t"
11288 "mov %c[r9](%0), %%r9 \n\t"
11289 "mov %c[r10](%0), %%r10 \n\t"
11290 "mov %c[r11](%0), %%r11 \n\t"
11291 "mov %c[r12](%0), %%r12 \n\t"
11292 "mov %c[r13](%0), %%r13 \n\t"
11293 "mov %c[r14](%0), %%r14 \n\t"
11294 "mov %c[r15](%0), %%r15 \n\t"
11295#endif
11296 "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */
11297
11298 /* Enter guest mode */
11299 "jne 1f \n\t"
11300 __ex("vmlaunch") "\n\t"
11301 "jmp 2f \n\t"
11302 "1: " __ex("vmresume") "\n\t"
11303 "2: "
11304 /* Save guest registers, load host registers, keep flags */
11305 "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
11306 "pop %0 \n\t"
11307 "setbe %c[fail](%0)\n\t"
11308 "mov %%" _ASM_AX ", %c[rax](%0) \n\t"
11309 "mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
11310 __ASM_SIZE(pop) " %c[rcx](%0) \n\t"
11311 "mov %%" _ASM_DX ", %c[rdx](%0) \n\t"
11312 "mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
11313 "mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
11314 "mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
11315#ifdef CONFIG_X86_64
11316 "mov %%r8, %c[r8](%0) \n\t"
11317 "mov %%r9, %c[r9](%0) \n\t"
11318 "mov %%r10, %c[r10](%0) \n\t"
11319 "mov %%r11, %c[r11](%0) \n\t"
11320 "mov %%r12, %c[r12](%0) \n\t"
11321 "mov %%r13, %c[r13](%0) \n\t"
11322 "mov %%r14, %c[r14](%0) \n\t"
11323 "mov %%r15, %c[r15](%0) \n\t"
11324 /*
11325 * Clear host registers marked as clobbered to prevent
11326 * speculative use.
11327 */
11328 "xor %%r8d, %%r8d \n\t"
11329 "xor %%r9d, %%r9d \n\t"
11330 "xor %%r10d, %%r10d \n\t"
11331 "xor %%r11d, %%r11d \n\t"
11332 "xor %%r12d, %%r12d \n\t"
11333 "xor %%r13d, %%r13d \n\t"
11334 "xor %%r14d, %%r14d \n\t"
11335 "xor %%r15d, %%r15d \n\t"
11336#endif
11337 "mov %%cr2, %%" _ASM_AX " \n\t"
11338 "mov %%" _ASM_AX ", %c[cr2](%0) \n\t"
11339
11340 "xor %%eax, %%eax \n\t"
11341 "xor %%ebx, %%ebx \n\t"
11342 "xor %%esi, %%esi \n\t"
11343 "xor %%edi, %%edi \n\t"
11344 "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
11345 ".pushsection .rodata \n\t"
11346 ".global vmx_return \n\t"
11347 "vmx_return: " _ASM_PTR " 2b \n\t"
11348 ".popsection"
11349 : : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp),
11350 [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
11351 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
11352 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
11353 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
11354 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
11355 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
11356 [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
11357 [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
11358 [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
11359 [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
11360#ifdef CONFIG_X86_64
11361 [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
11362 [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
11363 [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
11364 [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
11365 [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
11366 [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
11367 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
11368 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
11369#endif
11370 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
11371 [wordsize]"i"(sizeof(ulong))
11372 : "cc", "memory"
11373#ifdef CONFIG_X86_64
11374 , "rax", "rbx", "rdi"
11375 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
11376#else
11377 , "eax", "ebx", "edi"
11378#endif
11379 );
11380
11381 /*
11382 * We do not use IBRS in the kernel. If this vCPU has used the
11383 * SPEC_CTRL MSR it may have left it on; save the value and
11384 * turn it off. This is much more efficient than blindly adding
11385 * it to the atomic save/restore list. Especially as the former
11386 * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
11387 *
11388 * For non-nested case:
11389 * If the L01 MSR bitmap does not intercept the MSR, then we need to
11390 * save it.
11391 *
11392 * For nested case:
11393 * If the L02 MSR bitmap does not intercept the MSR, then we need to
11394 * save it.
11395 */
11396 if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
11397 vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
11398
11399 x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
11400
11401 /* Eliminate branch target predictions from guest mode */
11402 vmexit_fill_RSB();
11403
11404 /* All fields are clean at this point */
11405 if (static_branch_unlikely(&enable_evmcs))
11406 current_evmcs->hv_clean_fields |=
11407 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
11408
11409 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
11410 if (vmx->host_debugctlmsr)
11411 update_debugctlmsr(vmx->host_debugctlmsr);
11412
11413#ifndef CONFIG_X86_64
11414 /*
11415 * The sysexit path does not restore ds/es, so we must set them to
11416 * a reasonable value ourselves.
11417 *
11418 * We can't defer this to vmx_prepare_switch_to_host() since that
11419 * function may be executed in interrupt context, which saves and
11420 * restores segments around it, nullifying its effect.
11421 */
11422 loadsegment(ds, __USER_DS);
11423 loadsegment(es, __USER_DS);
11424#endif
11425
11426 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
11427 | (1 << VCPU_EXREG_RFLAGS)
11428 | (1 << VCPU_EXREG_PDPTR)
11429 | (1 << VCPU_EXREG_SEGMENTS)
11430 | (1 << VCPU_EXREG_CR3));
11431 vcpu->arch.regs_dirty = 0;
11432
11433 /*
11434 * Eager FPU is enabled if PKEY is supported, and CR4 has been switched
11435 * back to the host value, so it is safe to read the guest PKRU from the
11436 * current XSAVE area.
11437 */
11438 if (static_cpu_has(X86_FEATURE_PKU) &&
11439 kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
11440 vcpu->arch.pkru = __read_pkru();
11441 if (vcpu->arch.pkru != vmx->host_pkru)
11442 __write_pkru(vmx->host_pkru);
11443 }
11444
11445 vmx->nested.nested_run_pending = 0;
11446 vmx->idt_vectoring_info = 0;
11447
11448 vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
11449 if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
11450 return;
11451
11452 vmx->loaded_vmcs->launched = 1;
11453 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
11454
11455 vmx_complete_atomic_exit(vmx);
11456 vmx_recover_nmi_blocking(vmx);
11457 vmx_complete_interrupts(vmx);
11458}
11459STACK_FRAME_NON_STANDARD(vmx_vcpu_run);
11460
11461static struct kvm *vmx_vm_alloc(void)
11462{
11463 struct kvm_vmx *kvm_vmx = vzalloc(sizeof(struct kvm_vmx));
11464 return &kvm_vmx->kvm;
11465}
11466
11467static void vmx_vm_free(struct kvm *kvm)
11468{
11469 vfree(to_kvm_vmx(kvm));
11470}
11471
11472static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
11473{
11474 struct vcpu_vmx *vmx = to_vmx(vcpu);
11475 int cpu;
11476
11477 if (vmx->loaded_vmcs == vmcs)
11478 return;
11479
11480 cpu = get_cpu();
11481 vmx_vcpu_put(vcpu);
11482 vmx->loaded_vmcs = vmcs;
11483 vmx_vcpu_load(vcpu, cpu);
11484 put_cpu();
11485
11486 vm_entry_controls_reset_shadow(vmx);
11487 vm_exit_controls_reset_shadow(vmx);
11488 vmx_segment_cache_clear(vmx);
11489}
11490
11491/*
11492 * Ensure that the current vmcs of the logical processor is the
11493 * vmcs01 of the vcpu before calling free_nested().
11494 */
11495static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
11496{
11497 vcpu_load(vcpu);
11498 vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
11499 free_nested(vcpu);
11500 vcpu_put(vcpu);
11501}
11502
11503static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
11504{
11505 struct vcpu_vmx *vmx = to_vmx(vcpu);
11506
11507 if (enable_pml)
11508 vmx_destroy_pml_buffer(vmx);
11509 free_vpid(vmx->vpid);
11510 leave_guest_mode(vcpu);
11511 vmx_free_vcpu_nested(vcpu);
11512 free_loaded_vmcs(vmx->loaded_vmcs);
11513 kfree(vmx->guest_msrs);
11514 kvm_vcpu_uninit(vcpu);
11515 kmem_cache_free(kvm_vcpu_cache, vmx);
11516}
11517
11518static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
11519{
11520 int err;
11521 struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
11522 unsigned long *msr_bitmap;
11523 int cpu;
11524
11525 if (!vmx)
11526 return ERR_PTR(-ENOMEM);
11527
11528 vmx->vpid = allocate_vpid();
11529
11530 err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
11531 if (err)
11532 goto free_vcpu;
11533
11534 err = -ENOMEM;
11535
11536 /*
11537 * If PML is turned on, failure to enable PML just results in failure
11538 * to create the vcpu, therefore we can simplify the PML logic (by
11539 * avoiding having to deal with cases such as enabling PML partially
11540 * on vcpus for the guest, etc.).
11541 */
11542 if (enable_pml) {
11543 vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
11544 if (!vmx->pml_pg)
11545 goto uninit_vcpu;
11546 }
11547
11548 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
11549 BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
11550 > PAGE_SIZE);
11551
11552 if (!vmx->guest_msrs)
11553 goto free_pml;
11554
11555 err = alloc_loaded_vmcs(&vmx->vmcs01);
11556 if (err < 0)
11557 goto free_msrs;
11558
11559 msr_bitmap = vmx->vmcs01.msr_bitmap;
11560 vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
11561 vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
11562 vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
11563 vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
11564 vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
11565 vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
11566 vmx->msr_bitmap_mode = 0;
11567
11568 vmx->loaded_vmcs = &vmx->vmcs01;
11569 cpu = get_cpu();
11570 vmx_vcpu_load(&vmx->vcpu, cpu);
11571 vmx->vcpu.cpu = cpu;
11572 vmx_vcpu_setup(vmx);
11573 vmx_vcpu_put(&vmx->vcpu);
11574 put_cpu();
11575 if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
11576 err = alloc_apic_access_page(kvm);
11577 if (err)
11578 goto free_vmcs;
11579 }
11580
11581 if (enable_ept && !enable_unrestricted_guest) {
11582 err = init_rmode_identity_map(kvm);
11583 if (err)
11584 goto free_vmcs;
11585 }
11586
11587 if (nested)
11588 nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
11589 kvm_vcpu_apicv_active(&vmx->vcpu));
11590
11591 vmx->nested.posted_intr_nv = -1;
11592 vmx->nested.current_vmptr = -1ull;
11593
11594 vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
11595
11596 /*
11597 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
11598 * or POSTED_INTR_WAKEUP_VECTOR.
11599 */
11600 vmx->pi_desc.nv = POSTED_INTR_VECTOR;
11601 vmx->pi_desc.sn = 1;
11602
11603 return &vmx->vcpu;
11604
11605free_vmcs:
11606 free_loaded_vmcs(vmx->loaded_vmcs);
11607free_msrs:
11608 kfree(vmx->guest_msrs);
11609free_pml:
11610 vmx_destroy_pml_buffer(vmx);
11611uninit_vcpu:
11612 kvm_vcpu_uninit(&vmx->vcpu);
11613free_vcpu:
11614 free_vpid(vmx->vpid);
11615 kmem_cache_free(kvm_vcpu_cache, vmx);
11616 return ERR_PTR(err);
11617}
11618
11619#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
11620#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
11621
11622static int vmx_vm_init(struct kvm *kvm)
11623{
11624 spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
11625
11626 if (!ple_gap)
11627 kvm->arch.pause_in_guest = true;
11628
11629 if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
11630 switch (l1tf_mitigation) {
11631 case L1TF_MITIGATION_OFF:
11632 case L1TF_MITIGATION_FLUSH_NOWARN:
11633 /* 'I explicitly don't care' is set */
11634 break;
11635 case L1TF_MITIGATION_FLUSH:
11636 case L1TF_MITIGATION_FLUSH_NOSMT:
11637 case L1TF_MITIGATION_FULL:
11638 /*
11639 * Warn upon starting the first VM in a potentially
11640 * insecure environment.
11641 */
11642 if (cpu_smt_control == CPU_SMT_ENABLED)
11643 pr_warn_once(L1TF_MSG_SMT);
11644 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
11645 pr_warn_once(L1TF_MSG_L1D);
11646 break;
11647 case L1TF_MITIGATION_FULL_FORCE:
11648 /* Flush is enforced */
11649 break;
11650 }
11651 }
11652 return 0;
11653}
11654
11655static void __init vmx_check_processor_compat(void *rtn)
11656{
11657 struct vmcs_config vmcs_conf;
11658
11659 *(int *)rtn = 0;
11660 if (setup_vmcs_config(&vmcs_conf) < 0)
11661 *(int *)rtn = -EIO;
11662 nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, enable_apicv);
11663 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
11664 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
11665 smp_processor_id());
11666 *(int *)rtn = -EIO;
11667 }
11668}
11669
11670static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
11671{
11672 u8 cache;
11673 u64 ipat = 0;
11674
11675 /* For the combination of VT-d and EPT:
11676 * 1. MMIO: always map as UC
11677 * 2. EPT with VT-d:
11678 * a. VT-d without snooping control feature: can't guarantee the
11679 * result, try to trust the guest.
11680 * b. VT-d with snooping control feature: the snooping control of the
11681 * VT-d engine can guarantee cache correctness. Just set it
11682 * to WB to keep it consistent with the host, same as item 3.
11683 * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep it
11684 * consistent with the host MTRRs.
11685 */
11686 if (is_mmio) {
11687 cache = MTRR_TYPE_UNCACHABLE;
11688 goto exit;
11689 }
11690
11691 if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
11692 ipat = VMX_EPT_IPAT_BIT;
11693 cache = MTRR_TYPE_WRBACK;
11694 goto exit;
11695 }
11696
11697 if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
11698 ipat = VMX_EPT_IPAT_BIT;
11699 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
11700 cache = MTRR_TYPE_WRBACK;
11701 else
11702 cache = MTRR_TYPE_UNCACHABLE;
11703 goto exit;
11704 }
11705
11706 cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
11707
11708exit:
11709 return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
11710}
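/*
 * A worked example of the EPT memory-type encoding returned above, assuming
 * the usual constants (VMX_EPT_MT_EPTE_SHIFT == 3, VMX_EPT_IPAT_BIT == bit 6,
 * MTRR_TYPE_WRBACK == 6, MTRR_TYPE_UNCACHABLE == 0):
 *
 *   no non-coherent DMA:  (6 << 3) | (1 << 6) = 0x70  (WB, guest PAT ignored)
 *   MMIO gfn:             (0 << 3) |  0       = 0x00  (UC, guest PAT honored)
 */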
11711
11712static int vmx_get_lpage_level(void)
11713{
11714 if (enable_ept && !cpu_has_vmx_ept_1g_page())
11715 return PT_DIRECTORY_LEVEL;
11716 else
11717 /* For shadow paging and EPT with 1GB support, 1GB pages are allowed */
11718 return PT_PDPE_LEVEL;
11719}
11720
11721static void vmcs_set_secondary_exec_control(u32 new_ctl)
11722{
11723 /*
11724 * These bits in the secondary execution controls field
11725 * are dynamic; the others are mostly based on the hypervisor
11726 * architecture and the guest's CPUID. Do not touch the
11727 * dynamic bits.
11728 */
11729 u32 mask =
11730 SECONDARY_EXEC_SHADOW_VMCS |
11731 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
11732 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
11733 SECONDARY_EXEC_DESC;
11734
11735 u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
11736
11737 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
11738 (new_ctl & ~mask) | (cur_ctl & mask));
11739}
11740
11741/*
11742 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
11743 * (indicating "allowed-1") if they are supported in the guest's CPUID.
11744 */
11745static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
11746{
11747 struct vcpu_vmx *vmx = to_vmx(vcpu);
11748 struct kvm_cpuid_entry2 *entry;
11749
11750 vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
11751 vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
11752
11753#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \
11754 if (entry && (entry->_reg & (_cpuid_mask))) \
11755 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \
11756} while (0)
11757
11758 entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
11759 cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME));
11760 cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME));
11761 cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC));
11762 cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE));
11763 cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE));
11764 cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE));
11765 cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE));
11766 cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE));
11767 cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR));
11768 cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM));
11769 cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX));
11770 cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX));
11771 cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID));
11772 cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE));
11773
11774 entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
11775 cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE));
11776 cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP));
11777 cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP));
11778 cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU));
11779 cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP));
11780
11781#undef cr4_fixed1_update
11782}
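/*
 * As a concrete instance of the macro above (a sketch, not generated output),
 * the SMEP update expands roughly to:
 *
 *   if (entry && (entry->ebx & bit(X86_FEATURE_SMEP)))
 *           vmx->nested.msrs.cr4_fixed1 |= X86_CR4_SMEP;
 *
 * so CR4 bits whose CPUID feature is hidden from the guest stay "must be
 * zero" in the reported CR4_FIXED1 MSR.
 */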
11783
11784static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
11785{
11786 struct vcpu_vmx *vmx = to_vmx(vcpu);
11787
11788 if (kvm_mpx_supported()) {
11789 bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);
11790
11791 if (mpx_enabled) {
11792 vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
11793 vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
11794 } else {
11795 vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
11796 vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
11797 }
11798 }
11799}
11800
11801static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
11802{
11803 struct vcpu_vmx *vmx = to_vmx(vcpu);
11804
11805 if (cpu_has_secondary_exec_ctrls()) {
11806 vmx_compute_secondary_exec_control(vmx);
11807 vmcs_set_secondary_exec_control(vmx->secondary_exec_control);
11808 }
11809
11810 if (nested_vmx_allowed(vcpu))
11811 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
11812 FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
11813 else
11814 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
11815 ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
11816
11817 if (nested_vmx_allowed(vcpu)) {
11818 nested_vmx_cr_fixed1_bits_update(vcpu);
11819 nested_vmx_entry_exit_ctls_update(vcpu);
11820 }
11821}
11822
11823static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
11824{
11825 if (func == 1 && nested)
11826 entry->ecx |= bit(X86_FEATURE_VMX);
11827}
11828
11829static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
11830 struct x86_exception *fault)
11831{
11832 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
11833 struct vcpu_vmx *vmx = to_vmx(vcpu);
11834 u32 exit_reason;
11835 unsigned long exit_qualification = vcpu->arch.exit_qualification;
11836
11837 if (vmx->nested.pml_full) {
11838 exit_reason = EXIT_REASON_PML_FULL;
11839 vmx->nested.pml_full = false;
11840 exit_qualification &= INTR_INFO_UNBLOCK_NMI;
11841 } else if (fault->error_code & PFERR_RSVD_MASK)
11842 exit_reason = EXIT_REASON_EPT_MISCONFIG;
11843 else
11844 exit_reason = EXIT_REASON_EPT_VIOLATION;
11845
11846 nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
11847 vmcs12->guest_physical_address = fault->address;
11848}
11849
11850static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
11851{
11852 return nested_ept_get_cr3(vcpu) & VMX_EPTP_AD_ENABLE_BIT;
11853}
11854
11855/* Callbacks for nested_ept_init_mmu_context: */
11856
11857static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
11858{
11859 /* return the page table to be shadowed - in our case, EPT12 */
11860 return get_vmcs12(vcpu)->ept_pointer;
11861}
11862
11863static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
11864{
11865 WARN_ON(mmu_is_nested(vcpu));
11866
11867 vcpu->arch.mmu = &vcpu->arch.guest_mmu;
11868 kvm_init_shadow_ept_mmu(vcpu,
11869 to_vmx(vcpu)->nested.msrs.ept_caps &
11870 VMX_EPT_EXECUTE_ONLY_BIT,
11871 nested_ept_ad_enabled(vcpu),
11872 nested_ept_get_cr3(vcpu));
11873 vcpu->arch.mmu->set_cr3 = vmx_set_cr3;
11874 vcpu->arch.mmu->get_cr3 = nested_ept_get_cr3;
11875 vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
11876 vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;
11877
11878 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
11879}
11880
11881static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
11882{
11883 vcpu->arch.mmu = &vcpu->arch.root_mmu;
11884 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
11885}
11886
11887static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
11888 u16 error_code)
11889{
11890 bool inequality, bit;
11891
11892 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
11893 inequality =
11894 (error_code & vmcs12->page_fault_error_code_mask) !=
11895 vmcs12->page_fault_error_code_match;
11896 return inequality ^ bit;
11897}
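/*
 * A small truth table for the "inequality ^ bit" result above (a sketch of
 * the architected PFEC_MASK/PFEC_MATCH behaviour): with the #PF bit set in
 * the exception bitmap, the fault is reflected to L1 only when
 * (error_code & mask) == match; with the bit clear, only when they differ.
 *
 *   #PF bit   (ec & mask) == match   inequality   result
 *      1              yes                 0          1   -> vmexit to L1
 *      1              no                  1          0   -> fault stays with L2
 *      0              yes                 0          0   -> fault stays with L2
 *      0              no                  1          1   -> vmexit to L1
 */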
11898
11899static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
11900 struct x86_exception *fault)
11901{
11902 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
11903
11904 WARN_ON(!is_guest_mode(vcpu));
11905
11906 if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
11907 !to_vmx(vcpu)->nested.nested_run_pending) {
11908 vmcs12->vm_exit_intr_error_code = fault->error_code;
11909 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
11910 PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
11911 INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
11912 fault->address);
11913 } else {
11914 kvm_inject_page_fault(vcpu, fault);
11915 }
11916}
11917
11918static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
11919 struct vmcs12 *vmcs12);
11920
11921static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
11922{
11923 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
11924 struct vcpu_vmx *vmx = to_vmx(vcpu);
11925 struct page *page;
11926 u64 hpa;
11927
11928 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
11929 /*
11930 * Translate L1 physical address to host physical
11931 * address for vmcs02. Keep the page pinned, so this
11932 * physical address remains valid. We keep a reference
11933 * to it so we can release it later.
11934 */
11935 if (vmx->nested.apic_access_page) { /* shouldn't happen */
11936 kvm_release_page_dirty(vmx->nested.apic_access_page);
11937 vmx->nested.apic_access_page = NULL;
11938 }
11939 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
11940 /*
11941 * If translation failed, no matter: This feature asks
11942 * to exit when accessing the given address, and if it
11943 * can never be accessed, this feature won't do
11944 * anything anyway.
11945 */
11946 if (!is_error_page(page)) {
11947 vmx->nested.apic_access_page = page;
11948 hpa = page_to_phys(vmx->nested.apic_access_page);
11949 vmcs_write64(APIC_ACCESS_ADDR, hpa);
11950 } else {
11951 vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
11952 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
11953 }
11954 }
11955
11956 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
11957 if (vmx->nested.virtual_apic_page) { /* shouldn't happen */
11958 kvm_release_page_dirty(vmx->nested.virtual_apic_page);
11959 vmx->nested.virtual_apic_page = NULL;
11960 }
11961 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr);
11962
11963 /*
11964 * If translation failed, VM entry will fail because
11965 * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull.
11966 * Failing the vm entry is _not_ what the processor
11967 * does but it's basically the only possibility we
11968 * have. We could still enter the guest if CR8 load
11969 * exits are enabled, CR8 store exits are enabled, and
11970 * virtualize APIC access is disabled; in this case
11971 * the processor would never use the TPR shadow and we
11972 * could simply clear the bit from the execution
11973 * control. But such a configuration is useless, so
11974 * let's keep the code simple.
11975 */
11976 if (!is_error_page(page)) {
11977 vmx->nested.virtual_apic_page = page;
11978 hpa = page_to_phys(vmx->nested.virtual_apic_page);
11979 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
11980 }
11981 }
11982
11983 if (nested_cpu_has_posted_intr(vmcs12)) {
11984 if (vmx->nested.pi_desc_page) { /* shouldn't happen */
11985 kunmap(vmx->nested.pi_desc_page);
11986 kvm_release_page_dirty(vmx->nested.pi_desc_page);
11987 vmx->nested.pi_desc_page = NULL;
11988 vmx->nested.pi_desc = NULL;
11989 vmcs_write64(POSTED_INTR_DESC_ADDR, -1ull);
11990 }
11991 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr);
11992 if (is_error_page(page))
11993 return;
11994 vmx->nested.pi_desc_page = page;
11995 vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page);
11996 vmx->nested.pi_desc =
11997 (struct pi_desc *)((void *)vmx->nested.pi_desc +
11998 (unsigned long)(vmcs12->posted_intr_desc_addr &
11999 (PAGE_SIZE - 1)));
12000 vmcs_write64(POSTED_INTR_DESC_ADDR,
12001 page_to_phys(vmx->nested.pi_desc_page) +
12002 (unsigned long)(vmcs12->posted_intr_desc_addr &
12003 (PAGE_SIZE - 1)));
12004 }
12005 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
12006 vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
12007 CPU_BASED_USE_MSR_BITMAPS);
12008 else
12009 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
12010 CPU_BASED_USE_MSR_BITMAPS);
12011}
12012
12013static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
12014{
12015 u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
12016 struct vcpu_vmx *vmx = to_vmx(vcpu);
12017
12018 /*
12019 * A timer value of zero is architecturally guaranteed to cause
12020 * a VMExit prior to executing any instructions in the guest.
12021 */
12022 if (preemption_timeout == 0) {
12023 vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
12024 return;
12025 }
12026
12027 if (vcpu->arch.virtual_tsc_khz == 0)
12028 return;
12029
12030 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
12031 preemption_timeout *= 1000000;
12032 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
12033 hrtimer_start(&vmx->nested.preemption_timer,
12034 ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
12035}
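/*
 * A rough worked example of the conversion above, assuming the emulated
 * preemption-timer rate shift of 5 and virtual_tsc_khz == 2,000,000
 * (a 2 GHz guest TSC):
 *
 *   vmx_preemption_timer_value = 1000
 *   ticks = 1000 << 5 = 32,000
 *   ns    = 32,000 * 1,000,000 / 2,000,000 = 16,000 ns (16 us)
 *
 * so the hrtimer fires roughly 16 microseconds of guest TSC time later.
 */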
12036
12037static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
12038 struct vmcs12 *vmcs12)
12039{
12040 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
12041 return 0;
12042
12043 if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) ||
12044 !page_address_valid(vcpu, vmcs12->io_bitmap_b))
12045 return -EINVAL;
12046
12047 return 0;
12048}
12049
12050static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
12051 struct vmcs12 *vmcs12)
12052{
12053 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
12054 return 0;
12055
12056 if (!page_address_valid(vcpu, vmcs12->msr_bitmap))
12057 return -EINVAL;
12058
12059 return 0;
12060}
12061
12062static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
12063 struct vmcs12 *vmcs12)
12064{
12065 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
12066 return 0;
12067
12068 if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))
12069 return -EINVAL;
12070
12071 return 0;
12072}
12073
12074/*
12075 * Merge L0's and L1's MSR bitmaps; return false to indicate that
12076 * we do not use the hardware MSR bitmap.
12077 */
12078static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
12079 struct vmcs12 *vmcs12)
12080{
12081 int msr;
12082 struct page *page;
12083 unsigned long *msr_bitmap_l1;
12084 unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
12085 /*
12086 * pred_cmd & spec_ctrl are trying to verify two things:
12087 *
12088 * 1. L0 gave permission to L1 to actually pass through the MSR. This
12089 * ensures that we do not accidentally generate an L02 MSR bitmap
12090 * from the L12 MSR bitmap that is too permissive.
12091 * 2. That L1 or its L2s have actually used the MSR. This avoids
12092 * unnecessary merging of the bitmap if the MSR is unused. This
12093 * works properly because we only update the L01 MSR bitmap lazily.
12094 * So even if L0 should pass L1 these MSRs, the L01 bitmap is only
12095 * updated to reflect this when L1 (or its L2s) actually write to
12096 * the MSR.
12097 */
12098 bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
12099 bool spec_ctrl = !msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL);
12100
12101 /* Nothing to do if the MSR bitmap is not in use. */
12102 if (!cpu_has_vmx_msr_bitmap() ||
12103 !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
12104 return false;
12105
12106 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
12107 !pred_cmd && !spec_ctrl)
12108 return false;
12109
12110 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
12111 if (is_error_page(page))
12112 return false;
12113
12114 msr_bitmap_l1 = (unsigned long *)kmap(page);
12115 if (nested_cpu_has_apic_reg_virt(vmcs12)) {
12116 /*
12117 * L0 need not intercept reads for MSRs between 0x800 and 0x8ff; it
12118 * just lets the processor take the value from the virtual-APIC page;
12119 * take those 256 bits directly from the L1 bitmap.
12120 */
12121 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
12122 unsigned word = msr / BITS_PER_LONG;
12123 msr_bitmap_l0[word] = msr_bitmap_l1[word];
12124 msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0;
12125 }
12126 } else {
12127 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
12128 unsigned word = msr / BITS_PER_LONG;
12129 msr_bitmap_l0[word] = ~0;
12130 msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0;
12131 }
12132 }
12133
12134 nested_vmx_disable_intercept_for_msr(
12135 msr_bitmap_l1, msr_bitmap_l0,
12136 X2APIC_MSR(APIC_TASKPRI),
12137 MSR_TYPE_W);
12138
12139 if (nested_cpu_has_vid(vmcs12)) {
12140 nested_vmx_disable_intercept_for_msr(
12141 msr_bitmap_l1, msr_bitmap_l0,
12142 X2APIC_MSR(APIC_EOI),
12143 MSR_TYPE_W);
12144 nested_vmx_disable_intercept_for_msr(
12145 msr_bitmap_l1, msr_bitmap_l0,
12146 X2APIC_MSR(APIC_SELF_IPI),
12147 MSR_TYPE_W);
12148 }
12149
12150 if (spec_ctrl)
12151 nested_vmx_disable_intercept_for_msr(
12152 msr_bitmap_l1, msr_bitmap_l0,
12153 MSR_IA32_SPEC_CTRL,
12154 MSR_TYPE_R | MSR_TYPE_W);
12155
12156 if (pred_cmd)
12157 nested_vmx_disable_intercept_for_msr(
12158 msr_bitmap_l1, msr_bitmap_l0,
12159 MSR_IA32_PRED_CMD,
12160 MSR_TYPE_W);
12161
12162 kunmap(page);
12163 kvm_release_page_clean(page);
12164
12165 return true;
12166}
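/*
 * A worked example of the bitmap indexing above, assuming a 64-bit host
 * (BITS_PER_LONG == 64) and the architectural MSR-bitmap layout (low-MSR
 * reads in the first 1 KiB of the page, low-MSR writes starting at byte
 * offset 0x800):
 *
 *   msr = 0x800:  read word  = 0x800 / 64 = 32
 *                 write word = 32 + 0x800 / 8 = 288
 *
 * so the 0x800-0x8ff loop touches read words 32-35 and write words 288-291.
 */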
12167
12168static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
12169 struct vmcs12 *vmcs12)
12170{
12171 struct vmcs12 *shadow;
12172 struct page *page;
12173
12174 if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
12175 vmcs12->vmcs_link_pointer == -1ull)
12176 return;
12177
12178 shadow = get_shadow_vmcs12(vcpu);
12179 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
12180
12181 memcpy(shadow, kmap(page), VMCS12_SIZE);
12182
12183 kunmap(page);
12184 kvm_release_page_clean(page);
12185}
12186
12187static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
12188 struct vmcs12 *vmcs12)
12189{
12190 struct vcpu_vmx *vmx = to_vmx(vcpu);
12191
12192 if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
12193 vmcs12->vmcs_link_pointer == -1ull)
12194 return;
12195
12196 kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
12197 get_shadow_vmcs12(vcpu), VMCS12_SIZE);
12198}
12199
12200static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
12201 struct vmcs12 *vmcs12)
12202{
12203 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
12204 !page_address_valid(vcpu, vmcs12->apic_access_addr))
12205 return -EINVAL;
12206 else
12207 return 0;
12208}
12209
12210static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
12211 struct vmcs12 *vmcs12)
12212{
12213 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
12214 !nested_cpu_has_apic_reg_virt(vmcs12) &&
12215 !nested_cpu_has_vid(vmcs12) &&
12216 !nested_cpu_has_posted_intr(vmcs12))
12217 return 0;
12218
12219 /*
12220 * If virtualize x2apic mode is enabled,
12221 * virtualize apic access must be disabled.
12222 */
12223 if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
12224 nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
12225 return -EINVAL;
12226
12227 /*
12228 * If virtual interrupt delivery is enabled,
12229 * we must exit on external interrupts.
12230 */
12231 if (nested_cpu_has_vid(vmcs12) &&
12232 !nested_exit_on_intr(vcpu))
12233 return -EINVAL;
12234
12235 /*
12236 * bits 15:8 should be zero in posted_intr_nv,
12237 * the descriptor address has already been checked
12238 * in nested_get_vmcs12_pages.
12239 *
12240 * bits 5:0 of posted_intr_desc_addr should be zero.
12241 */
12242 if (nested_cpu_has_posted_intr(vmcs12) &&
12243 (!nested_cpu_has_vid(vmcs12) ||
12244 !nested_exit_intr_ack_set(vcpu) ||
12245 (vmcs12->posted_intr_nv & 0xff00) ||
12246 (vmcs12->posted_intr_desc_addr & 0x3f) ||
12247 (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu))))
12248 return -EINVAL;
12249
12250 /* tpr shadow is needed by all apicv features. */
12251 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
12252 return -EINVAL;
12253
12254 return 0;
12255}
12256
12257static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
12258 unsigned long count_field,
12259 unsigned long addr_field)
12260{
12261 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
12262 int maxphyaddr;
12263 u64 count, addr;
12264
12265 if (vmcs12_read_any(vmcs12, count_field, &count) ||
12266 vmcs12_read_any(vmcs12, addr_field, &addr)) {
12267 WARN_ON(1);
12268 return -EINVAL;
12269 }
12270 if (count == 0)
12271 return 0;
12272 maxphyaddr = cpuid_maxphyaddr(vcpu);
12273 if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
12274 (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
12275 pr_debug_ratelimited(
12276 "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
12277 addr_field, maxphyaddr, count, addr);
12278 return -EINVAL;
12279 }
12280 return 0;
12281}
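/*
 * A worked instance of the range check above, assuming cpuid_maxphyaddr()
 * == 48 and sizeof(struct vmx_msr_entry) == 16:
 *
 *   addr = 0x10000, count = 256:  last byte = 0x10000 + 256*16 - 1 = 0x10fff,
 *                                 16-byte aligned and below 2^48 -> accepted
 *   addr = 0x10008:               not 16-byte aligned            -> -EINVAL
 */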
12282
12283static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
12284 struct vmcs12 *vmcs12)
12285{
12286 if (vmcs12->vm_exit_msr_load_count == 0 &&
12287 vmcs12->vm_exit_msr_store_count == 0 &&
12288 vmcs12->vm_entry_msr_load_count == 0)
12289 return 0; /* Fast path */
12290 if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
12291 VM_EXIT_MSR_LOAD_ADDR) ||
12292 nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
12293 VM_EXIT_MSR_STORE_ADDR) ||
12294 nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
12295 VM_ENTRY_MSR_LOAD_ADDR))
12296 return -EINVAL;
12297 return 0;
12298}
12299
12300static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
12301 struct vmcs12 *vmcs12)
12302{
12303 if (!nested_cpu_has_pml(vmcs12))
12304 return 0;
12305
12306 if (!nested_cpu_has_ept(vmcs12) ||
12307 !page_address_valid(vcpu, vmcs12->pml_address))
12308 return -EINVAL;
12309
12310 return 0;
12311}
12312
12313static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
12314 struct vmcs12 *vmcs12)
12315{
12316 if (!nested_cpu_has_shadow_vmcs(vmcs12))
12317 return 0;
12318
12319 if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) ||
12320 !page_address_valid(vcpu, vmcs12->vmwrite_bitmap))
12321 return -EINVAL;
12322
12323 return 0;
12324}
12325
12326static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
12327 struct vmx_msr_entry *e)
12328{
12329 /* x2APIC MSR accesses are not allowed */
12330 if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)
12331 return -EINVAL;
12332 if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
12333 e->index == MSR_IA32_UCODE_REV)
12334 return -EINVAL;
12335 if (e->reserved != 0)
12336 return -EINVAL;
12337 return 0;
12338}
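/*
 * For illustration, the "e->index >> 8 == 0x8" test above catches the whole
 * x2APIC MSR range 0x800-0x8ff; e.g. the x2APIC EOI register (MSR 0x80b,
 * 0x80b >> 8 == 0x8) is rejected once X2APIC_ENABLE is set in apic_base.
 */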
12339
12340static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
12341 struct vmx_msr_entry *e)
12342{
12343 if (e->index == MSR_FS_BASE ||
12344 e->index == MSR_GS_BASE ||
12345 e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
12346 nested_vmx_msr_check_common(vcpu, e))
12347 return -EINVAL;
12348 return 0;
12349}
12350
12351static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
12352 struct vmx_msr_entry *e)
12353{
12354 if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
12355 nested_vmx_msr_check_common(vcpu, e))
12356 return -EINVAL;
12357 return 0;
12358}
12359
12360/*
12361 * Load the guest's/host's MSRs at nested entry/exit.
12362 * Return 0 on success, or the 1-based index of the failing entry on failure.
12363 */
12364static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
12365{
12366 u32 i;
12367 struct vmx_msr_entry e;
12368 struct msr_data msr;
12369
12370 msr.host_initiated = false;
12371 for (i = 0; i < count; i++) {
12372 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
12373 &e, sizeof(e))) {
12374 pr_debug_ratelimited(
12375 "%s cannot read MSR entry (%u, 0x%08llx)\n",
12376 __func__, i, gpa + i * sizeof(e));
12377 goto fail;
12378 }
12379 if (nested_vmx_load_msr_check(vcpu, &e)) {
12380 pr_debug_ratelimited(
12381 "%s check failed (%u, 0x%x, 0x%x)\n",
12382 __func__, i, e.index, e.reserved);
12383 goto fail;
12384 }
12385 msr.index = e.index;
12386 msr.data = e.value;
12387 if (kvm_set_msr(vcpu, &msr)) {
12388 pr_debug_ratelimited(
12389 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
12390 __func__, i, e.index, e.value);
12391 goto fail;
12392 }
12393 }
12394 return 0;
12395fail:
12396 return i + 1;
12397}
12398
12399static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
12400{
12401 u32 i;
12402 struct vmx_msr_entry e;
12403
12404 for (i = 0; i < count; i++) {
12405 struct msr_data msr_info;
12406 if (kvm_vcpu_read_guest(vcpu,
12407 gpa + i * sizeof(e),
12408 &e, 2 * sizeof(u32))) {
12409 pr_debug_ratelimited(
12410 "%s cannot read MSR entry (%u, 0x%08llx)\n",
12411 __func__, i, gpa + i * sizeof(e));
12412 return -EINVAL;
12413 }
12414 if (nested_vmx_store_msr_check(vcpu, &e)) {
12415 pr_debug_ratelimited(
12416 "%s check failed (%u, 0x%x, 0x%x)\n",
12417 __func__, i, e.index, e.reserved);
12418 return -EINVAL;
12419 }
12420 msr_info.host_initiated = false;
12421 msr_info.index = e.index;
12422 if (kvm_get_msr(vcpu, &msr_info)) {
12423 pr_debug_ratelimited(
12424 "%s cannot read MSR (%u, 0x%x)\n",
12425 __func__, i, e.index);
12426 return -EINVAL;
12427 }
12428 if (kvm_vcpu_write_guest(vcpu,
12429 gpa + i * sizeof(e) +
12430 offsetof(struct vmx_msr_entry, value),
12431 &msr_info.data, sizeof(msr_info.data))) {
12432 pr_debug_ratelimited(
12433 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
12434 __func__, i, e.index, msr_info.data);
12435 return -EINVAL;
12436 }
12437 }
12438 return 0;
12439}
12440
12441static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
12442{
12443 unsigned long invalid_mask;
12444
12445 invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
12446 return (val & invalid_mask) == 0;
12447}
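/*
 * A quick worked example of the mask above, assuming cpuid_maxphyaddr()
 * == 48:
 *
 *   invalid_mask = ~0ULL << 48 = 0xffff000000000000
 *
 * so a CR3 of 0x000012345678a000 passes, while any value with a bit set at
 * or above bit 48 is rejected.
 */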
12448
12449/*
12450 * Load the guest's/host's CR3 at nested entry/exit. nested_ept is true if we
12451 * are emulating VM entry into a guest with EPT enabled.
12452 * Returns 0 on success, 1 on failure; on failure, the invalid-state exit
12453 * qualification code is assigned to entry_failure_code.
12454 */
12455static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
12456 u32 *entry_failure_code)
12457{
12458 if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
12459 if (!nested_cr3_valid(vcpu, cr3)) {
12460 *entry_failure_code = ENTRY_FAIL_DEFAULT;
12461 return 1;
12462 }
12463
12464 /*
12465 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
12466 * must not be dereferenced.
12467 */
12468 if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) &&
12469 !nested_ept) {
12470 if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) {
12471 *entry_failure_code = ENTRY_FAIL_PDPTE;
12472 return 1;
12473 }
12474 }
12475 }
12476
12477 if (!nested_ept)
12478 kvm_mmu_new_cr3(vcpu, cr3, false);
12479
12480 vcpu->arch.cr3 = cr3;
12481 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
12482
12483 kvm_init_mmu(vcpu, false);
12484
12485 return 0;
12486}
12487
12488/*
12489 * Returns true if KVM is able to configure the CPU to tag TLB
12490 * entries populated by L2 differently from TLB entries populated
12491 * by L1.
12492 *
12493 * If L1 uses EPT, then TLB entries are tagged with different EPTP.
12494 *
12495 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
12496 * with different VPID (L1 entries are tagged with vmx->vpid
12497 * while L2 entries are tagged with vmx->nested.vpid02).
12498 */
12499static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
12500{
12501 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
12502
12503 return nested_cpu_has_ept(vmcs12) ||
12504 (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
12505}
12506
12507static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
12508{
12509 if (vmx->nested.nested_run_pending &&
12510 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
12511 return vmcs12->guest_ia32_efer;
12512 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
12513 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
12514 else
12515 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
12516}
12517
12518static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
12519{
12520 /*
12521 * If vmcs02 hasn't been initialized, set the constant vmcs02 state
12522 * according to L0's settings (vmcs12 is irrelevant here). Host
12523 * fields that come from L0 and are not constant, e.g. HOST_CR3,
12524 * will be set as needed prior to VMLAUNCH/VMRESUME.
12525 */
12526 if (vmx->nested.vmcs02_initialized)
12527 return;
12528 vmx->nested.vmcs02_initialized = true;
12529
12530 /*
12531 * We don't care what the EPTP value is; we just need to guarantee
12532 * it's valid so we don't get a false positive when doing early
12533 * consistency checks.
12534 */
12535 if (enable_ept && nested_early_check)
12536 vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0));
12537
12538 /* All VMFUNCs are currently emulated through L0 vmexits. */
12539 if (cpu_has_vmx_vmfunc())
12540 vmcs_write64(VM_FUNCTION_CONTROL, 0);
12541
12542 if (cpu_has_vmx_posted_intr())
12543 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
12544
12545 if (cpu_has_vmx_msr_bitmap())
12546 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
12547
12548 if (enable_pml)
12549 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
12550
12551 /*
12552 * Set the MSR load/store lists to match L0's settings. Only the
12553 * addresses are constant (for vmcs02), the counts can change based
12554 * on L2's behavior, e.g. switching to/from long mode.
12555 */
12556 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
12557 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
12558 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
12559
12560 vmx_set_constant_host_state(vmx);
12561}
12562
12563static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx,
12564 struct vmcs12 *vmcs12)
12565{
12566 prepare_vmcs02_constant_state(vmx);
12567
12568 vmcs_write64(VMCS_LINK_POINTER, -1ull);
12569
12570 if (enable_vpid) {
12571 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
12572 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
12573 else
12574 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
12575 }
12576}
12577
12578static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
12579{
12580 u32 exec_control, vmcs12_exec_ctrl;
12581 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
12582
12583 if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
12584 prepare_vmcs02_early_full(vmx, vmcs12);
12585
12586 /*
12587 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
12588 * entry, but only if the current (host) sp changed from the value
12589 * we wrote last (vmx->host_rsp). This cache is no longer relevant
12590 * if we switch vmcs, and rather than hold a separate cache per vmcs,
12591 * here we just force the write to happen on entry. host_rsp will
12592 * also be written unconditionally by nested_vmx_check_vmentry_hw()
12593 * if we are doing early consistency checks via hardware.
12594 */
12595 vmx->host_rsp = 0;
12596
12597 /*
12598 * PIN CONTROLS
12599 */
12600 exec_control = vmcs12->pin_based_vm_exec_control;
12601
12602 /* Preemption timer setting is computed directly in vmx_vcpu_run. */
12603 exec_control |= vmcs_config.pin_based_exec_ctrl;
12604 exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
12605 vmx->loaded_vmcs->hv_timer_armed = false;
12606
12607 /* Posted interrupts setting is only taken from vmcs12. */
12608 if (nested_cpu_has_posted_intr(vmcs12)) {
12609 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
12610 vmx->nested.pi_pending = false;
12611 } else {
12612 exec_control &= ~PIN_BASED_POSTED_INTR;
12613 }
12614 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
12615
12616 /*
12617 * EXEC CONTROLS
12618 */
12619 exec_control = vmx_exec_control(vmx); /* L0's desires */
12620 exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
12621 exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
12622 exec_control &= ~CPU_BASED_TPR_SHADOW;
12623 exec_control |= vmcs12->cpu_based_vm_exec_control;
12624
12625 /*
12626 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
12627 * nested_get_vmcs12_pages can't fix it up, the illegal value
12628 * will result in a VM entry failure.
12629 */
12630 if (exec_control & CPU_BASED_TPR_SHADOW) {
12631 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
12632 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
12633 } else {
12634#ifdef CONFIG_X86_64
12635 exec_control |= CPU_BASED_CR8_LOAD_EXITING |
12636 CPU_BASED_CR8_STORE_EXITING;
12637#endif
12638 }
12639
12640 /*
12641 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
12642 * for I/O port accesses.
12643 */
12644 exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
12645 exec_control |= CPU_BASED_UNCOND_IO_EXITING;
12646 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
12647
12648 /*
12649 * SECONDARY EXEC CONTROLS
12650 */
12651 if (cpu_has_secondary_exec_ctrls()) {
12652 exec_control = vmx->secondary_exec_control;
12653
12654 /* Take the following fields only from vmcs12 */
12655 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
12656 SECONDARY_EXEC_ENABLE_INVPCID |
12657 SECONDARY_EXEC_RDTSCP |
12658 SECONDARY_EXEC_XSAVES |
12659 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
12660 SECONDARY_EXEC_APIC_REGISTER_VIRT |
12661 SECONDARY_EXEC_ENABLE_VMFUNC);
12662 if (nested_cpu_has(vmcs12,
12663 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
12664 vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
12665 ~SECONDARY_EXEC_ENABLE_PML;
12666 exec_control |= vmcs12_exec_ctrl;
12667 }
12668
12669 /* VMCS shadowing for L2 is emulated for now */
12670 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
12671
12672 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
12673 vmcs_write16(GUEST_INTR_STATUS,
12674 vmcs12->guest_intr_status);
12675
12676 /*
12677 * Write an illegal value to APIC_ACCESS_ADDR. Later,
12678 * nested_get_vmcs12_pages will either fix it up or
12679 * remove the VM execution control.
12680 */
12681 if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
12682 vmcs_write64(APIC_ACCESS_ADDR, -1ull);
12683
12684 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
12685 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
12686
12687 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
12688 }
12689
12690 /*
12691 * ENTRY CONTROLS
12692 *
12693 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
12694 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
12695 * on the related bits (if supported by the CPU) in the hope that
12696 * we can avoid VMWrites during vmx_set_efer().
12697 */
12698 exec_control = (vmcs12->vm_entry_controls | vmcs_config.vmentry_ctrl) &
12699 ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
12700 if (cpu_has_load_ia32_efer) {
12701 if (guest_efer & EFER_LMA)
12702 exec_control |= VM_ENTRY_IA32E_MODE;
12703 if (guest_efer != host_efer)
12704 exec_control |= VM_ENTRY_LOAD_IA32_EFER;
12705 }
12706 vm_entry_controls_init(vmx, exec_control);
12707
12708 /*
12709 * EXIT CONTROLS
12710 *
12711 * L2->L1 exit controls are emulated - the hardware exit is to L0 so
12712 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
12713 * bits may be modified by vmx_set_efer() in prepare_vmcs02().
12714 */
12715 exec_control = vmcs_config.vmexit_ctrl;
12716 if (cpu_has_load_ia32_efer && guest_efer != host_efer)
12717 exec_control |= VM_EXIT_LOAD_IA32_EFER;
12718 vm_exit_controls_init(vmx, exec_control);
12719
12720 /*
12721 * Conceptually we want to copy the PML address and index from
12722 * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
12723 * since we always flush the log on each vmexit and never change
12724 * the PML address (once set), this happens to be equivalent to
12725 * simply resetting the index in vmcs02.
12726 */
12727 if (enable_pml)
12728 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
12729
12730 /*
12731 * Interrupt/Exception Fields
12732 */
12733 if (vmx->nested.nested_run_pending) {
12734 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
12735 vmcs12->vm_entry_intr_info_field);
12736 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
12737 vmcs12->vm_entry_exception_error_code);
12738 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
12739 vmcs12->vm_entry_instruction_len);
12740 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
12741 vmcs12->guest_interruptibility_info);
12742 vmx->loaded_vmcs->nmi_known_unmasked =
12743 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
12744 } else {
12745 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
12746 }
12747}
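/*
 * Editor's illustrative sketch (not part of vmx.c): the EFER speculation in
 * the ENTRY CONTROLS block above reduces to the pure helper below. The name
 * and signature are hypothetical; EFER_LMA and the VM_ENTRY_* flags are the
 * standard definitions used elsewhere in this file.
 */
static inline u32 speculate_vmentry_ctls(u32 base_ctls, u64 guest_efer,
					 u64 host_efer, bool has_load_efer)
{
	u32 ctls = base_ctls & ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;

	if (has_load_efer) {
		if (guest_efer & EFER_LMA)
			ctls |= VM_ENTRY_IA32E_MODE;
		if (guest_efer != host_efer)
			ctls |= VM_ENTRY_LOAD_IA32_EFER;
	}
	return ctls;
}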
12748
12749static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
12750{
12751 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
12752
12753 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
12754 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
12755 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
12756 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
12757 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
12758 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
12759 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
12760 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
12761 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
12762 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
12763 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
12764 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
12765 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
12766 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
12767 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
12768 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
12769 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
12770 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
12771 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
12772 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
12773 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
12774 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
12775 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
12776 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
12777 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
12778 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
12779 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
12780 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
12781 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
12782 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
12783 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
12784 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
12785 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
12786 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
12787 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
12788 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
12789 }
12790
12791 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
12792 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
12793 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
12794 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
12795 vmcs12->guest_pending_dbg_exceptions);
12796 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
12797 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
12798
12799 /*
12800 * L1 may access the L2's PDPTRs, so save them to construct
12801 * vmcs12.
12802 */
12803 if (enable_ept) {
12804 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
12805 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
12806 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
12807 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
12808 }
12809 }
12810
12811 if (nested_cpu_has_xsaves(vmcs12))
12812 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
12813
12814 /*
12815 * Whether page-faults are trapped is determined by a combination of
12816 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
12817 * If enable_ept, L0 doesn't care about page faults and we should
12818 * set all of these to L1's desires. However, if !enable_ept, L0 does
12819 * care about (at least some) page faults, and because it is not easy
12820 * (if at all possible?) to merge L0 and L1's desires, we simply ask
12821 * to exit on each and every L2 page fault. This is done by setting
12822 * MASK=MATCH=0 and (see below) EB.PF=1.
12823 * Note that below we don't need special code to set EB.PF beyond the
12824 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
12825 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
12826 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
12827 */
12828 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
12829 enable_ept ? vmcs12->page_fault_error_code_mask : 0);
12830 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
12831 enable_ept ? vmcs12->page_fault_error_code_match : 0);
12832
12833 if (cpu_has_vmx_apicv()) {
12834 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
12835 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
12836 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
12837 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
12838 }
12839
12840 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
12841 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
12842
12843 set_cr4_guest_host_mask(vmx);
12844
12845 if (kvm_mpx_supported()) {
12846 if (vmx->nested.nested_run_pending &&
12847 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
12848 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
12849 else
12850 vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
12851 }
12852}
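/*
 * Editor's illustrative sketch (not part of vmx.c): as I read the SDM rule
 * behind the PFEC_MASK/PFEC_MATCH comment above, a #PF in L2 causes a VM exit
 * iff the equality test agrees with EXCEPTION_BITMAP.PF. With MASK=MATCH=0
 * and EB.PF=1 (the !enable_ept case), every #PF therefore exits.
 */
static inline bool pf_causes_vmexit(u32 pfec, bool eb_pf, u32 pfec_mask,
				    u32 pfec_match)
{
	return ((pfec & pfec_mask) == pfec_match) == eb_pf;
}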
12853
12854/*
12855 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
12856 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
12857 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
12858 * guest in a way that will both be appropriate to L1's requests, and our
12859 * needs. In addition to modifying the active vmcs (which is vmcs02), this
12860 * function also has necessary side effects, such as setting various
12861 * vcpu->arch fields.
12862 * Returns 0 on success, 1 on failure. On failure, the invalid-state exit
12863 * qualification code is assigned to entry_failure_code.
12864 */
12865static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
12866 u32 *entry_failure_code)
12867{
12868 struct vcpu_vmx *vmx = to_vmx(vcpu);
12869 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
12870
12871 if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) {
12872 prepare_vmcs02_full(vmx, vmcs12);
12873 vmx->nested.dirty_vmcs12 = false;
12874 }
12875
12876 /*
12877 * First, the fields that are shadowed. This must be kept in sync
12878 * with vmx_shadow_fields.h.
12879 */
12880 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
12881 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
12882 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
12883 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
12884 }
12885
12886 if (vmx->nested.nested_run_pending &&
12887 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
12888 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
12889 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
12890 } else {
12891 kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
12892 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
12893 }
12894 vmx_set_rflags(vcpu, vmcs12->guest_rflags);
12895
12896 vmx->nested.preemption_timer_expired = false;
12897 if (nested_cpu_has_preemption_timer(vmcs12))
12898 vmx_start_preemption_timer(vcpu);
12899
12900 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
12901 * bitwise-or of what L1 wants to trap for L2, and what we want to
12902 * trap. Note that CR0.TS also needs updating - we do this later.
12903 */
12904 update_exception_bitmap(vcpu);
12905 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
12906 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
12907
12908 if (vmx->nested.nested_run_pending &&
12909 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
12910 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
12911 vcpu->arch.pat = vmcs12->guest_ia32_pat;
12912 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
12913 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
12914 }
12915
12916 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
12917
12918 if (kvm_has_tsc_control)
12919 decache_tsc_multiplier(vmx);
12920
12921 if (enable_vpid) {
12922 /*
12923 * There is no direct mapping between vpid02 and vpid12: vpid02
12924 * is per-vCPU for L0 and is reused, while an INVVPID is issued
12925 * during nested vmentry whenever the value of vpid12 changes.
12926 * vpid12 is allocated by L1 for L2, so it does not influence the
12927 * global bitmap (used for vpid01 and vpid02 allocation) even if
12928 * L1 spawns a lot of nested vCPUs.
12929 */
12930 if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) {
12931 if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
12932 vmx->nested.last_vpid = vmcs12->virtual_processor_id;
12933 __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false);
12934 }
12935 } else {
12936 /*
12937 * If L1 uses EPT, then L0 needs to execute INVEPT on
12938 * EPTP02 instead of EPTP01. Therefore, delay TLB
12939 * flush until vmcs02->eptp is fully updated by
12940 * KVM_REQ_LOAD_CR3. Note that this assumes
12941 * KVM_REQ_TLB_FLUSH is evaluated after
12942 * KVM_REQ_LOAD_CR3 in vcpu_enter_guest().
12943 */
12944 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
12945 }
12946 }
12947
12948 if (nested_cpu_has_ept(vmcs12))
12949 nested_ept_init_mmu_context(vcpu);
12950 else if (nested_cpu_has2(vmcs12,
12951 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
12952 vmx_flush_tlb(vcpu, true);
12953
12954 /*
12955 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
12956 * bits which we consider mandatory enabled.
12957 * The CR0_READ_SHADOW is what L2 should have expected to read given
12958 * the specifications by L1; it's not enough to take
12959 * vmcs12->cr0_read_shadow because our cr0_guest_host_mask may
12960 * have more bits set than L1 expected.
12961 */
12962 vmx_set_cr0(vcpu, vmcs12->guest_cr0);
12963 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
12964
12965 vmx_set_cr4(vcpu, vmcs12->guest_cr4);
12966 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
12967
12968 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
12969 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
12970 vmx_set_efer(vcpu, vcpu->arch.efer);
12971
12972 /*
12973 * Guest state is invalid and unrestricted guest is disabled,
12974 * which means L1 attempted VMEntry to L2 with invalid state.
12975 * Fail the VMEntry.
12976 */
12977 if (vmx->emulation_required) {
12978 *entry_failure_code = ENTRY_FAIL_DEFAULT;
12979 return 1;
12980 }
12981
12982 /* Load the guest's CR3, shadowed by either EPT or shadow page tables. */
12983 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
12984 entry_failure_code))
12985 return 1;
12986
12987 if (!enable_ept)
12988 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
12989
12990 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
12991 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
12992 return 0;
12993}
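/*
 * Editor's sketch: nested_vmx_calc_efer(), used above, is defined elsewhere
 * in this file. A plausible implementation, consistent with how the entry
 * controls are described above, would be the following; treat it as an
 * assumption, not the actual source.
 */
static u64 nested_vmx_calc_efer_sketch(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
{
	if (vmx->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
		return vmcs12->guest_ia32_efer;
	if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
		return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
	return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
}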
12994
12995static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
12996{
12997 if (!nested_cpu_has_nmi_exiting(vmcs12) &&
12998 nested_cpu_has_virtual_nmis(vmcs12))
12999 return -EINVAL;
13000
13001 if (!nested_cpu_has_virtual_nmis(vmcs12) &&
13002 nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING))
13003 return -EINVAL;
13004
13005 return 0;
13006}
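/*
 * Editor's sketch of the two dependencies enforced above, written as a single
 * predicate (names are hypothetical): virtual NMIs require NMI exiting, and
 * NMI-window exiting (VIRTUAL_NMI_PENDING) requires virtual NMIs.
 */
static inline bool nmi_ctls_consistent(bool nmi_exiting, bool virtual_nmis,
				       bool nmi_window_exiting)
{
	return (!virtual_nmis || nmi_exiting) &&
	       (!nmi_window_exiting || virtual_nmis);
}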
13007
13008static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
13009{
13010 struct vcpu_vmx *vmx = to_vmx(vcpu);
13011 bool ia32e;
13012
13013 if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
13014 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
13015 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13016
13017 if (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)
13018 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13019
13020 if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12))
13021 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13022
13023 if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
13024 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13025
13026 if (nested_vmx_check_apic_access_controls(vcpu, vmcs12))
13027 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13028
13029 if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12))
13030 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13031
13032 if (nested_vmx_check_apicv_controls(vcpu, vmcs12))
13033 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13034
13035 if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12))
13036 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13037
13038 if (nested_vmx_check_pml_controls(vcpu, vmcs12))
13039 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13040
13041 if (nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12))
13042 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13043
13044 if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
13045 vmx->nested.msrs.procbased_ctls_low,
13046 vmx->nested.msrs.procbased_ctls_high) ||
13047 (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
13048 !vmx_control_verify(vmcs12->secondary_vm_exec_control,
13049 vmx->nested.msrs.secondary_ctls_low,
13050 vmx->nested.msrs.secondary_ctls_high)) ||
13051 !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
13052 vmx->nested.msrs.pinbased_ctls_low,
13053 vmx->nested.msrs.pinbased_ctls_high) ||
13054 !vmx_control_verify(vmcs12->vm_exit_controls,
13055 vmx->nested.msrs.exit_ctls_low,
13056 vmx->nested.msrs.exit_ctls_high) ||
13057 !vmx_control_verify(vmcs12->vm_entry_controls,
13058 vmx->nested.msrs.entry_ctls_low,
13059 vmx->nested.msrs.entry_ctls_high))
13060 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13061
13062 if (nested_vmx_check_nmi_controls(vmcs12))
13063 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13064
13065 if (nested_cpu_has_vmfunc(vmcs12)) {
13066 if (vmcs12->vm_function_control &
13067 ~vmx->nested.msrs.vmfunc_controls)
13068 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13069
13070 if (nested_cpu_has_eptp_switching(vmcs12)) {
13071 if (!nested_cpu_has_ept(vmcs12) ||
13072 !page_address_valid(vcpu, vmcs12->eptp_list_address))
13073 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13074 }
13075 }
13076
13077 if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu))
13078 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13079
13080 if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) ||
13081 !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
13082 !nested_cr3_valid(vcpu, vmcs12->host_cr3))
13083 return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
13084
13085 /*
13086 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
13087 * IA32_EFER MSR must be 0 in the field for that register. In addition,
13088 * the values of the LMA and LME bits in the field must each be that of
13089 * the host address-space size VM-exit control.
13090 */
13091 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
13092 ia32e = (vmcs12->vm_exit_controls &
13093 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
13094 if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
13095 ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
13096 ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
13097 return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
13098 }
13099
13100 /*
13101 * From the Intel SDM, volume 3:
13102 * Fields relevant to VM-entry event injection must be set properly.
13103 * These fields are the VM-entry interruption-information field, the
13104 * VM-entry exception error code, and the VM-entry instruction length.
13105 */
13106 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
13107 u32 intr_info = vmcs12->vm_entry_intr_info_field;
13108 u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
13109 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
13110 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
13111 bool should_have_error_code;
13112 bool urg = nested_cpu_has2(vmcs12,
13113 SECONDARY_EXEC_UNRESTRICTED_GUEST);
13114 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
13115
13116 /* VM-entry interruption-info field: interruption type */
13117 if (intr_type == INTR_TYPE_RESERVED ||
13118 (intr_type == INTR_TYPE_OTHER_EVENT &&
13119 !nested_cpu_supports_monitor_trap_flag(vcpu)))
13120 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13121
13122 /* VM-entry interruption-info field: vector */
13123 if ((intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
13124 (intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
13125 (intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
13126 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13127
13128 /* VM-entry interruption-info field: deliver error code */
13129 should_have_error_code =
13130 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
13131 x86_exception_has_error_code(vector);
13132 if (has_error_code != should_have_error_code)
13133 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13134
13135 /* VM-entry exception error code */
13136 if (has_error_code &&
13137 vmcs12->vm_entry_exception_error_code & GENMASK(31, 15))
13138 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13139
13140 /* VM-entry interruption-info field: reserved bits */
13141 if (intr_info & INTR_INFO_RESVD_BITS_MASK)
13142 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13143
13144 /* VM-entry instruction length */
13145 switch (intr_type) {
13146 case INTR_TYPE_SOFT_EXCEPTION:
13147 case INTR_TYPE_SOFT_INTR:
13148 case INTR_TYPE_PRIV_SW_EXCEPTION:
13149 if ((vmcs12->vm_entry_instruction_len > 15) ||
13150 (vmcs12->vm_entry_instruction_len == 0 &&
13151 !nested_cpu_has_zero_length_injection(vcpu)))
13152 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13153 }
13154 }
13155
13156 if (nested_cpu_has_ept(vmcs12) &&
13157 !valid_ept_address(vcpu, vmcs12->ept_pointer))
13158 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
13159
13160 return 0;
13161}
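/*
 * Editor's note: vmx_control_verify(), used heavily above, is defined earlier
 * in this file. Its semantics are the usual allowed-0/allowed-1 check derived
 * from the VMX capability MSRs; a sketch (assumed, not copied from the
 * source): every "allowed-0" bit in 'low' must be set, and no bit outside the
 * "allowed-1" mask in 'high' may be set.
 */
static inline bool control_verify_sketch(u32 control, u32 low, u32 high)
{
	return ((control & high) | low) == control;
}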
13162
13163static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
13164 struct vmcs12 *vmcs12)
13165{
13166 int r;
13167 struct page *page;
13168 struct vmcs12 *shadow;
13169
13170 if (vmcs12->vmcs_link_pointer == -1ull)
13171 return 0;
13172
13173 if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))
13174 return -EINVAL;
13175
13176 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
13177 if (is_error_page(page))
13178 return -EINVAL;
13179
13180 r = 0;
13181 shadow = kmap(page);
13182 if (shadow->hdr.revision_id != VMCS12_REVISION ||
13183 shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
13184 r = -EINVAL;
13185 kunmap(page);
13186 kvm_release_page_clean(page);
13187 return r;
13188}
13189
13190static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
13191 u32 *exit_qual)
13192{
13193 bool ia32e;
13194
13195 *exit_qual = ENTRY_FAIL_DEFAULT;
13196
13197 if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
13198 !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
13199 return 1;
13200
13201 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
13202 *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
13203 return 1;
13204 }
13205
13206 /*
13207 * If the load IA32_EFER VM-entry control is 1, the following checks
13208 * are performed on the field for the IA32_EFER MSR:
13209 * - Bits reserved in the IA32_EFER MSR must be 0.
13210 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
13211 * the IA-32e mode guest VM-exit control. It must also be identical
13212 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
13213 * CR0.PG) is 1.
13214 */
13215 if (to_vmx(vcpu)->nested.nested_run_pending &&
13216 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
13217 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
13218 if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
13219 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
13220 ((vmcs12->guest_cr0 & X86_CR0_PG) &&
13221 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
13222 return 1;
13223 }
13224
13225 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
13226 (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
13227 (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
13228 return 1;
13229
13230 return 0;
13231}
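/*
 * Editor's sketch (hypothetical helper): the SDM's LMA/LME consistency rule
 * quoted above, factored out. 'ia32e' is the "IA-32e mode guest" (or "host
 * address-space size") control and 'paging' is CR0.PG.
 */
static inline bool efer_matches_ia32e(u64 efer, bool ia32e, bool paging)
{
	if (ia32e != !!(efer & EFER_LMA))
		return false;
	if (paging && ia32e != !!(efer & EFER_LME))
		return false;
	return true;
}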
13232
13233static int __noclone nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
13234{
13235 struct vcpu_vmx *vmx = to_vmx(vcpu);
13236 unsigned long cr3, cr4;
13237
13238 if (!nested_early_check)
13239 return 0;
13240
13241 if (vmx->msr_autoload.host.nr)
13242 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
13243 if (vmx->msr_autoload.guest.nr)
13244 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
13245
13246 preempt_disable();
13247
13248 vmx_prepare_switch_to_guest(vcpu);
13249
13250 /*
13251 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
13252 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to
13253 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
13254 * there is no need to preserve other bits or save/restore the field.
13255 */
13256 vmcs_writel(GUEST_RFLAGS, 0);
13257
13258 vmcs_writel(HOST_RIP, vmx_early_consistency_check_return);
13259
13260 cr3 = __get_current_cr3_fast();
13261 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
13262 vmcs_writel(HOST_CR3, cr3);
13263 vmx->loaded_vmcs->host_state.cr3 = cr3;
13264 }
13265
13266 cr4 = cr4_read_shadow();
13267 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
13268 vmcs_writel(HOST_CR4, cr4);
13269 vmx->loaded_vmcs->host_state.cr4 = cr4;
13270 }
13271
13272 vmx->__launched = vmx->loaded_vmcs->launched;
13273
13274 asm(
13275 /* Set HOST_RSP */
13276 __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
13277 "mov %%" _ASM_SP ", %c[host_rsp](%0)\n\t"
13278
13279 /* Check if vmlaunch or vmresume is needed */
13280 "cmpl $0, %c[launched](%0)\n\t"
13281 "je 1f\n\t"
13282 __ex("vmresume") "\n\t"
13283 "jmp 2f\n\t"
13284 "1: " __ex("vmlaunch") "\n\t"
13285 "jmp 2f\n\t"
13286 "2: "
13287
13288 /* Set vmx->fail accordingly */
13289 "setbe %c[fail](%0)\n\t"
13290
13291 ".pushsection .rodata\n\t"
13292 ".global vmx_early_consistency_check_return\n\t"
13293 "vmx_early_consistency_check_return: " _ASM_PTR " 2b\n\t"
13294 ".popsection"
13295 :
13296 : "c"(vmx), "d"((unsigned long)HOST_RSP),
13297 [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
13298 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
13299 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp))
13300 : "rax", "cc", "memory"
13301 );
13302
13303 vmcs_writel(HOST_RIP, vmx_return);
13304
13305 preempt_enable();
13306
13307 if (vmx->msr_autoload.host.nr)
13308 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
13309 if (vmx->msr_autoload.guest.nr)
13310 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
13311
13312 if (vmx->fail) {
13313 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
13314 VMXERR_ENTRY_INVALID_CONTROL_FIELD);
13315 vmx->fail = 0;
13316 return 1;
13317 }
13318
13319 /*
13320 * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
13321 */
13322 local_irq_enable();
13323 if (hw_breakpoint_active())
13324 set_debugreg(__this_cpu_read(cpu_dr7), 7);
13325
13326 /*
13327 * A non-failing VMEntry means we somehow entered guest mode with
13328 * an illegal RIP, and that's just the tip of the iceberg. There
13329 * is no telling what memory has been modified or what state has
13330 * been exposed to unknown code. Hitting this all but guarantees
13331 * a (very critical) hardware issue.
13332 */
13333 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
13334 VMX_EXIT_REASONS_FAILED_VMENTRY));
13335
13336 return 0;
13337}
13338STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw);
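/*
 * Editor's note on the "setbe" above: VMfailInvalid sets RFLAGS.CF and
 * VMfailValid sets RFLAGS.ZF, and SETBE stores 1 when CF=1 or ZF=1, so the
 * byte written to vmx->fail is non-zero exactly on a VMfail. A sketch of the
 * same condition in C (X86_EFLAGS_* are the standard flag masks):
 */
static inline bool vm_instruction_failed(unsigned long rflags)
{
	return rflags & (X86_EFLAGS_CF | X86_EFLAGS_ZF);
}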
13339
13340static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
13341 struct vmcs12 *vmcs12);
13342
13343/*
13344 * If from_vmentry is false, this is being called from state restore (either RSM
13345 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
13346 *
13347 * Returns:
13348 * 0 - success, i.e. proceed with actual VMEnter
13349 * 1 - consistency check VMExit
13350 * -1 - consistency check VMFail
13351 */
13352static int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
13353 bool from_vmentry)
13354{
13355 struct vcpu_vmx *vmx = to_vmx(vcpu);
13356 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
13357 bool evaluate_pending_interrupts;
13358 u32 exit_reason = EXIT_REASON_INVALID_STATE;
13359 u32 exit_qual;
13360
13361 evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
13362 (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
13363 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
13364 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
13365
13366 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
13367 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
13368 if (kvm_mpx_supported() &&
13369 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
13370 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
13371
13372 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
13373
13374 prepare_vmcs02_early(vmx, vmcs12);
13375
13376 if (from_vmentry) {
13377 nested_get_vmcs12_pages(vcpu);
13378
13379 if (nested_vmx_check_vmentry_hw(vcpu)) {
13380 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
13381 return -1;
13382 }
13383
13384 if (check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
13385 goto vmentry_fail_vmexit;
13386 }
13387
13388 enter_guest_mode(vcpu);
13389 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
13390 vcpu->arch.tsc_offset += vmcs12->tsc_offset;
13391
13392 if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
13393 goto vmentry_fail_vmexit_guest_mode;
13394
13395 if (from_vmentry) {
13396 exit_reason = EXIT_REASON_MSR_LOAD_FAIL;
13397 exit_qual = nested_vmx_load_msr(vcpu,
13398 vmcs12->vm_entry_msr_load_addr,
13399 vmcs12->vm_entry_msr_load_count);
13400 if (exit_qual)
13401 goto vmentry_fail_vmexit_guest_mode;
13402 } else {
13403 /*
13404 * The MMU is not initialized to point at the right entities yet and
13405 * "get pages" would need to read data from the guest (i.e. we will
13406 * need to perform gpa to hpa translation). Request a call
13407 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs
13408 * have already been set at vmentry time and should not be reset.
13409 */
13410 kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
13411 }
13412
13413 /*
13414 * If L1 had a pending IRQ/NMI until it executed
13415 * VMLAUNCH/VMRESUME which wasn't delivered because it was
13416 * disallowed (e.g. interrupts disabled), L0 needs to
13417 * evaluate if this pending event should cause an exit from L2
13418 * to L1 or be delivered directly to L2 (e.g. in case L1 doesn't
13419 * intercept EXTERNAL_INTERRUPT).
13420 *
13421 * Usually this would be handled by the processor noticing an
13422 * IRQ/NMI window request, or checking RVI during evaluation of
13423 * pending virtual interrupts. However, this setting was done
13424 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
13425 * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
13426 */
13427 if (unlikely(evaluate_pending_interrupts))
13428 kvm_make_request(KVM_REQ_EVENT, vcpu);
13429
13430 /*
13431 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
13432 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
13433 * returned as far as L1 is concerned. It will only return (and set
13434 * the success flag) when L2 exits (see nested_vmx_vmexit()).
13435 */
13436 return 0;
13437
13438 /*
13439 * A failed consistency check that leads to a VMExit during L1's
13440 * VMEnter to L2 is a variation of a normal VMexit, as explained in
13441 * 26.7 "VM-entry failures during or after loading guest state".
13442 */
13443vmentry_fail_vmexit_guest_mode:
13444 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
13445 vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
13446 leave_guest_mode(vcpu);
13447
13448vmentry_fail_vmexit:
13449 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
13450
13451 if (!from_vmentry)
13452 return 1;
13453
13454 load_vmcs12_host_state(vcpu, vmcs12);
13455 vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
13456 vmcs12->exit_qualification = exit_qual;
13457 if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
13458 vmx->nested.need_vmcs12_sync = true;
13459 return 1;
13460}
13461
13462/*
13463 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
13464 * for running an L2 nested guest.
13465 */
13466static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
13467{
13468 struct vmcs12 *vmcs12;
13469 struct vcpu_vmx *vmx = to_vmx(vcpu);
13470 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
13471 int ret;
13472
13473 if (!nested_vmx_check_permission(vcpu))
13474 return 1;
13475
13476 if (!nested_vmx_handle_enlightened_vmptrld(vcpu, true))
13477 return 1;
13478
13479 if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
13480 return nested_vmx_failInvalid(vcpu);
13481
13482 vmcs12 = get_vmcs12(vcpu);
13483
13484 /*
13485 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
13486 * that there *is* a valid VMCS pointer, RFLAGS.CF is set
13487 * rather than RFLAGS.ZF, and no error number is stored to the
13488 * VM-instruction error field.
13489 */
13490 if (vmcs12->hdr.shadow_vmcs)
13491 return nested_vmx_failInvalid(vcpu);
13492
13493 if (vmx->nested.hv_evmcs) {
13494 copy_enlightened_to_vmcs12(vmx);
13495 /* Enlightened VMCS doesn't have launch state */
13496 vmcs12->launch_state = !launch;
13497 } else if (enable_shadow_vmcs) {
13498 copy_shadow_to_vmcs12(vmx);
13499 }
13500
13501 /*
13502 * The nested entry process starts with enforcing various prerequisites
13503 * on vmcs12 as required by the Intel SDM, and acting appropriately when
13504 * they fail: as the SDM explains, some conditions should cause the
13505 * instruction to fail, while others will cause the instruction to seem
13506 * to succeed, but return an EXIT_REASON_INVALID_STATE.
13507 * To speed up the normal (success) code path, we should avoid checking
13508 * for misconfigurations which will anyway be caught by the processor
13509 * when using the merged vmcs02.
13510 */
13511 if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)
13512 return nested_vmx_failValid(vcpu,
13513 VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
13514
13515 if (vmcs12->launch_state == launch)
13516 return nested_vmx_failValid(vcpu,
13517 launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
13518 : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
13519
13520 ret = check_vmentry_prereqs(vcpu, vmcs12);
13521 if (ret)
13522 return nested_vmx_failValid(vcpu, ret);
13523
13524 /*
13525 * We're finally done with prerequisite checking, and can start with
13526 * the nested entry.
13527 */
13528 vmx->nested.nested_run_pending = 1;
13529 ret = nested_vmx_enter_non_root_mode(vcpu, true);
13530 vmx->nested.nested_run_pending = !ret;
13531 if (ret > 0)
13532 return 1;
13533 else if (ret)
13534 return nested_vmx_failValid(vcpu,
13535 VMXERR_ENTRY_INVALID_CONTROL_FIELD);
13536
13537 /* Hide L1D cache contents from the nested guest. */
13538 vmx->vcpu.arch.l1tf_flush_l1d = true;
13539
13540 /*
13541 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
13542 * also be used as part of restoring nVMX state for
13543 * snapshot restore (migration).
13544 *
13545 * In this flow, it is assumed that the vmcs12 cache was
13546 * transferred as part of the captured nVMX state and should
13547 * therefore not be read from guest memory (which may not
13548 * exist on destination host yet).
13549 */
13550 nested_cache_shadow_vmcs12(vcpu, vmcs12);
13551
13552 /*
13553 * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
13554 * by event injection, halt the vcpu.
13555 */
13556 if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
13557 !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK)) {
13558 vmx->nested.nested_run_pending = 0;
13559 return kvm_vcpu_halt(vcpu);
13560 }
13561 return 1;
13562}
13563
13564/*
13565 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
13566 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
13567 * This function returns the new value we should put in vmcs12.guest_cr0.
13568 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
13569 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
13570 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
13571 * didn't trap the bit, because if L1 did, so would L0).
13572 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
13573 * been modified by L2, and L1 knows it. So just leave the old value of
13574 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
13575 * isn't relevant, because if L0 traps this bit it can set it to anything.
13576 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
13577 * changed these bits, and therefore they need to be updated, but L0
13578 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather
13579 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
13580 */
13581static inline unsigned long
13582vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
13583{
13584 return
13585 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
13586 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
13587 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
13588 vcpu->arch.cr0_guest_owned_bits));
13589}
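/*
 * Editor's worked example (hypothetical bit assignments): suppose L0 lets the
 * guest own only CR0.TS (cr0_guest_owned_bits == X86_CR0_TS) and L1 traps
 * CR0.PG and CR0.PE (cr0_guest_host_mask == X86_CR0_PG | X86_CR0_PE). Then
 * the merged vmcs12.guest_cr0 takes TS from vmcs02 GUEST_CR0 (case 1), PG and
 * PE from the old vmcs12->guest_cr0 (case 2), and every remaining bit from
 * vmcs02 CR0_READ_SHADOW (case 3).
 */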
13590
13591static inline unsigned long
13592vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
13593{
13594 return
13595 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
13596 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
13597 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
13598 vcpu->arch.cr4_guest_owned_bits));
13599}
13600
13601static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
13602 struct vmcs12 *vmcs12)
13603{
13604 u32 idt_vectoring;
13605 unsigned int nr;
13606
13607 if (vcpu->arch.exception.injected) {
13608 nr = vcpu->arch.exception.nr;
13609 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
13610
13611 if (kvm_exception_is_soft(nr)) {
13612 vmcs12->vm_exit_instruction_len =
13613 vcpu->arch.event_exit_inst_len;
13614 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
13615 } else
13616 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
13617
13618 if (vcpu->arch.exception.has_error_code) {
13619 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
13620 vmcs12->idt_vectoring_error_code =
13621 vcpu->arch.exception.error_code;
13622 }
13623
13624 vmcs12->idt_vectoring_info_field = idt_vectoring;
13625 } else if (vcpu->arch.nmi_injected) {
13626 vmcs12->idt_vectoring_info_field =
13627 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
13628 } else if (vcpu->arch.interrupt.injected) {
13629 nr = vcpu->arch.interrupt.nr;
13630 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
13631
13632 if (vcpu->arch.interrupt.soft) {
13633 idt_vectoring |= INTR_TYPE_SOFT_INTR;
13634 vmcs12->vm_entry_instruction_len =
13635 vcpu->arch.event_exit_inst_len;
13636 } else
13637 idt_vectoring |= INTR_TYPE_EXT_INTR;
13638
13639 vmcs12->idt_vectoring_info_field = idt_vectoring;
13640 }
13641}
13642
13643static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
13644{
13645 struct vcpu_vmx *vmx = to_vmx(vcpu);
13646 unsigned long exit_qual;
13647 bool block_nested_events =
13648 vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
13649
13650 if (vcpu->arch.exception.pending &&
13651 nested_vmx_check_exception(vcpu, &exit_qual)) {
13652 if (block_nested_events)
13653 return -EBUSY;
13654 nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
13655 return 0;
13656 }
13657
13658 if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
13659 vmx->nested.preemption_timer_expired) {
13660 if (block_nested_events)
13661 return -EBUSY;
13662 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
13663 return 0;
13664 }
13665
13666 if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
13667 if (block_nested_events)
13668 return -EBUSY;
13669 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
13670 NMI_VECTOR | INTR_TYPE_NMI_INTR |
13671 INTR_INFO_VALID_MASK, 0);
13672 /*
13673 * The NMI-triggered VM exit counts as injection:
13674 * clear this one and block further NMIs.
13675 */
13676 vcpu->arch.nmi_pending = 0;
13677 vmx_set_nmi_mask(vcpu, true);
13678 return 0;
13679 }
13680
13681 if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
13682 nested_exit_on_intr(vcpu)) {
13683 if (block_nested_events)
13684 return -EBUSY;
13685 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
13686 return 0;
13687 }
13688
13689 vmx_complete_nested_posted_interrupt(vcpu);
13690 return 0;
13691}
13692
13693static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
13694{
13695 to_vmx(vcpu)->req_immediate_exit = true;
13696}
13697
13698static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
13699{
13700 ktime_t remaining =
13701 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
13702 u64 value;
13703
13704 if (ktime_to_ns(remaining) <= 0)
13705 return 0;
13706
13707 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
13708 do_div(value, 1000000);
13709 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
13710}
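/*
 * Editor's worked example (assuming an emulated preemption-timer rate shift
 * of 5, i.e. the timer ticks at TSC/32): with 1,000,000 ns remaining and
 * virtual_tsc_khz = 2,000,000 (a 2 GHz guest TSC), the conversion above gives
 * 1e6 * 2e6 = 2e12, divided by 1e6 -> 2,000,000 TSC ticks, shifted right by
 * 5 -> 62,500 preemption-timer units written back to vmcs12.
 */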
13711
13712/*
13713 * Update the guest state fields of vmcs12 to reflect changes that
13714 * occurred while L2 was running. (The "IA-32e mode guest" bit of the
13715 * VM-entry controls is also updated, since this is really a guest
13716 * state bit.)
13717 */
13718static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
13719{
13720 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
13721 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
13722
13723 vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
13724 vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
13725 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
13726
13727 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
13728 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
13729 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
13730 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
13731 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
13732 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
13733 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
13734 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
13735 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
13736 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
13737 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
13738 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
13739 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
13740 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
13741 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
13742 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
13743 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
13744 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
13745 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
13746 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
13747 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
13748 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
13749 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
13750 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
13751 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
13752 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
13753 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
13754 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
13755 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
13756 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
13757 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
13758 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
13759 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
13760 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
13761 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
13762 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
13763
13764 vmcs12->guest_interruptibility_info =
13765 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
13766 vmcs12->guest_pending_dbg_exceptions =
13767 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
13768 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
13769 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
13770 else
13771 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
13772
13773 if (nested_cpu_has_preemption_timer(vmcs12)) {
13774 if (vmcs12->vm_exit_controls &
13775 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
13776 vmcs12->vmx_preemption_timer_value =
13777 vmx_get_preemption_timer_value(vcpu);
13778 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
13779 }
13780
13781 /*
13782 * In some cases (usually, nested EPT), L2 is allowed to change its
13783 * own CR3 without exiting. If it has changed it, we must keep it.
13784 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
13785 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
13786 *
13787 * Additionally, restore L2's PDPTR to vmcs12.
13788 */
13789 if (enable_ept) {
13790 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
13791 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
13792 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
13793 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
13794 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
13795 }
13796
13797 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
13798
13799 if (nested_cpu_has_vid(vmcs12))
13800 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
13801
13802 vmcs12->vm_entry_controls =
13803 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
13804 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
13805
13806 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) {
13807 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
13808 vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
13809 }
13810
13811 /* TODO: These cannot have changed unless we have MSR bitmaps and
13812 * the relevant bit asks not to trap the change */
13813 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
13814 vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
13815 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
13816 vmcs12->guest_ia32_efer = vcpu->arch.efer;
13817 vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
13818 vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
13819 vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
13820 if (kvm_mpx_supported())
13821 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
13822}
13823
13824/*
13825 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
13826 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
13827 * and this function updates it to reflect the changes to the guest state while
13828 * L2 was running (and perhaps made some exits which were handled directly by L0
13829 * without going back to L1), and to reflect the exit reason.
13830 * Note that we do not have to copy here all VMCS fields, just those that
13831 * could have changed by the L2 guest or the exit - i.e., the guest-state and
13832 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
13833 * which already writes to vmcs12 directly.
13834 */
13835static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
13836 u32 exit_reason, u32 exit_intr_info,
13837 unsigned long exit_qualification)
13838{
13839 /* update guest state fields: */
13840 sync_vmcs12(vcpu, vmcs12);
13841
13842 /* update exit information fields: */
13843
13844 vmcs12->vm_exit_reason = exit_reason;
13845 vmcs12->exit_qualification = exit_qualification;
13846 vmcs12->vm_exit_intr_info = exit_intr_info;
13847
13848 vmcs12->idt_vectoring_info_field = 0;
13849 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
13850 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
13851
13852 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
13853 vmcs12->launch_state = 1;
13854
13855 /* vm_entry_intr_info_field is cleared on exit. Emulate this
13856 * instead of reading the real value. */
13857 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
13858
13859 /*
13860 * Transfer the event that L0 or L1 may have wanted to inject
13861 * into L2 to IDT_VECTORING_INFO_FIELD.
13862 */
13863 vmcs12_save_pending_event(vcpu, vmcs12);
13864 }
13865
13866 /*
13867 * Drop what we picked up for L2 via vmx_complete_interrupts. It is
13868 * preserved above and would only end up incorrectly in L1.
13869 */
13870 vcpu->arch.nmi_injected = false;
13871 kvm_clear_exception_queue(vcpu);
13872 kvm_clear_interrupt_queue(vcpu);
13873}
13874
13875/*
13876 * Part of what we need to do when the nested L2 guest exits and we want to
13877 * run its L1 parent is to reset L1's guest state to the host state specified
13878 * in vmcs12.
13879 * This function is to be called not only on normal nested exit, but also on
13880 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
13881 * Failures During or After Loading Guest State").
13882 * This function should be called when the active VMCS is L1's (vmcs01).
13883 */
13884static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
13885 struct vmcs12 *vmcs12)
13886{
13887 struct kvm_segment seg;
13888 u32 entry_failure_code;
13889
13890 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
13891 vcpu->arch.efer = vmcs12->host_ia32_efer;
13892 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
13893 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
13894 else
13895 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
13896 vmx_set_efer(vcpu, vcpu->arch.efer);
13897
13898 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
13899 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
13900 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
13901 vmx_set_interrupt_shadow(vcpu, 0);
13902
13903 /*
13904 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
13905 * actually changed, because vmx_set_cr0 refers to the EFER value set above.
13906 *
13907 * CR0_GUEST_HOST_MASK is already set in the original vmcs01
13908 * (KVM doesn't change it);
13909 */
13910 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
13911 vmx_set_cr0(vcpu, vmcs12->host_cr0);
13912
13913 /* Same as above - no reason to call set_cr4_guest_host_mask(). */
13914 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
13915 vmx_set_cr4(vcpu, vmcs12->host_cr4);
13916
13917 nested_ept_uninit_mmu_context(vcpu);
13918
13919 /*
13920 * Only PDPTE load can fail as the value of cr3 was checked on entry and
13921 * couldn't have changed.
13922 */
13923 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
13924 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
13925
13926 if (!enable_ept)
13927 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
13928
13929 /*
13930 * If vmcs01 doesn't use VPID, CPU flushes TLB on every
13931 * VMEntry/VMExit. Thus, no need to flush TLB.
13932 *
13933 * If vmcs12 doesn't use VPID, L1 expects TLB to be
13934 * flushed on every VMEntry/VMExit.
13935 *
13936 * Otherwise, we can preserve TLB entries as long as we are
13937 * able to tag L1 TLB entries differently than L2 TLB entries.
13938 *
13939 * If vmcs12 uses EPT, we need to execute this flush on EPTP01
13940 * and therefore we request the TLB flush to happen only after VMCS EPTP
13941 * has been set by KVM_REQ_LOAD_CR3.
13942 */
13943 if (enable_vpid &&
13944 (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) {
13945 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
13946 }
13947
13948 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
13949 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
13950 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
13951 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
13952 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
13953 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
13954 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
13955
13956 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
13957 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
13958 vmcs_write64(GUEST_BNDCFGS, 0);
13959
13960 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
13961 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
13962 vcpu->arch.pat = vmcs12->host_ia32_pat;
13963 }
13964 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
13965 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
13966 vmcs12->host_ia32_perf_global_ctrl);
13967
13968 /* Set L1 segment info according to Intel SDM
13969 27.5.2 Loading Host Segment and Descriptor-Table Registers */
13970 seg = (struct kvm_segment) {
13971 .base = 0,
13972 .limit = 0xFFFFFFFF,
13973 .selector = vmcs12->host_cs_selector,
13974 .type = 11,
13975 .present = 1,
13976 .s = 1,
13977 .g = 1
13978 };
13979 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
13980 seg.l = 1;
13981 else
13982 seg.db = 1;
13983 vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
13984 seg = (struct kvm_segment) {
13985 .base = 0,
13986 .limit = 0xFFFFFFFF,
13987 .type = 3,
13988 .present = 1,
13989 .s = 1,
13990 .db = 1,
13991 .g = 1
13992 };
13993 seg.selector = vmcs12->host_ds_selector;
13994 vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
13995 seg.selector = vmcs12->host_es_selector;
13996 vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
13997 seg.selector = vmcs12->host_ss_selector;
13998 vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
13999 seg.selector = vmcs12->host_fs_selector;
14000 seg.base = vmcs12->host_fs_base;
14001 vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
14002 seg.selector = vmcs12->host_gs_selector;
14003 seg.base = vmcs12->host_gs_base;
14004 vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
14005 seg = (struct kvm_segment) {
14006 .base = vmcs12->host_tr_base,
14007 .limit = 0x67,
14008 .selector = vmcs12->host_tr_selector,
14009 .type = 11,
14010 .present = 1
14011 };
14012 vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
14013
14014 kvm_set_dr(vcpu, 7, 0x400);
14015 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
14016
14017 if (cpu_has_vmx_msr_bitmap())
14018 vmx_update_msr_bitmap(vcpu);
14019
14020 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
14021 vmcs12->vm_exit_msr_load_count))
14022 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
14023}
14024
14025static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
14026{
14027 struct shared_msr_entry *efer_msr;
14028 unsigned int i;
14029
14030 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
14031 return vmcs_read64(GUEST_IA32_EFER);
14032
14033 if (cpu_has_load_ia32_efer)
14034 return host_efer;
14035
14036 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
14037 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
14038 return vmx->msr_autoload.guest.val[i].value;
14039 }
14040
14041 efer_msr = find_msr_entry(vmx, MSR_EFER);
14042 if (efer_msr)
14043 return efer_msr->data;
14044
14045 return host_efer;
14046}
14047
14048static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
14049{
14050 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
14051 struct vcpu_vmx *vmx = to_vmx(vcpu);
14052 struct vmx_msr_entry g, h;
14053 struct msr_data msr;
14054 gpa_t gpa;
14055 u32 i, j;
14056
14057 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
14058
14059 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
14060 /*
14061 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
14062 * as vmcs01.GUEST_DR7 contains a userspace defined value
14063 * and vcpu->arch.dr7 is not squirreled away before the
14064 * nested VMENTER (not worth adding a variable in nested_vmx).
14065 */
14066 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
14067 kvm_set_dr(vcpu, 7, DR7_FIXED_1);
14068 else
14069 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
14070 }
14071
14072 /*
14073 * Note that calling vmx_set_{efer,cr0,cr4} is important as they
14074 * handle a variety of side effects to KVM's software model.
14075 */
14076 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
14077
14078 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
14079 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
14080
14081 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
14082 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
14083
14084 nested_ept_uninit_mmu_context(vcpu);
14085 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
14086 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
14087
14088 /*
14089 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
14090 * from vmcs01 (if necessary). The PDPTRs are not loaded on
14091 * VMFail, like everything else we just need to ensure our
14092 * software model is up-to-date.
14093 */
14094 ept_save_pdptrs(vcpu);
14095
14096 kvm_mmu_reset_context(vcpu);
14097
14098 if (cpu_has_vmx_msr_bitmap())
14099 vmx_update_msr_bitmap(vcpu);
14100
14101 /*
14102 * This nasty bit of open coding is a compromise between blindly
14103 * loading L1's MSRs using the exit load lists (incorrect emulation
14104 * of VMFail), leaving the nested VM's MSRs in the software model
14105 * (incorrect behavior) and snapshotting the modified MSRs (too
14106 * expensive since the lists are unbounded by hardware). For each
14107 * MSR that was (prematurely) loaded from the nested VMEntry load
14108 * list, reload it from the exit load list if it exists and differs
14109 * from the guest value. The intent is to stuff host state as
14110 * silently as possible, not to fully process the exit load list.
14111 */
14112 msr.host_initiated = false;
14113 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
14114 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
14115 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
14116 pr_debug_ratelimited(
14117 "%s read MSR index failed (%u, 0x%08llx)\n",
14118 __func__, i, gpa);
14119 goto vmabort;
14120 }
14121
14122 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
14123 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
14124 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
14125 pr_debug_ratelimited(
14126 "%s read MSR failed (%u, 0x%08llx)\n",
14127 __func__, j, gpa);
14128 goto vmabort;
14129 }
14130 if (h.index != g.index)
14131 continue;
14132 if (h.value == g.value)
14133 break;
14134
14135 if (nested_vmx_load_msr_check(vcpu, &h)) {
14136 pr_debug_ratelimited(
14137 "%s check failed (%u, 0x%x, 0x%x)\n",
14138 __func__, j, h.index, h.reserved);
14139 goto vmabort;
14140 }
14141
14142 msr.index = h.index;
14143 msr.data = h.value;
14144 if (kvm_set_msr(vcpu, &msr)) {
14145 pr_debug_ratelimited(
14146 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
14147 __func__, j, h.index, h.value);
14148 goto vmabort;
14149 }
14150 }
14151 }
14152
14153 return;
14154
14155vmabort:
14156 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
14157}
14158
14159/*
14160 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
14161 * and modify vmcs12 to make it see what it would expect to see there if
14162 * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
14163 */
14164static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
14165 u32 exit_intr_info,
14166 unsigned long exit_qualification)
14167{
14168 struct vcpu_vmx *vmx = to_vmx(vcpu);
14169 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
14170
14171 /* trying to cancel vmlaunch/vmresume is a bug */
14172 WARN_ON_ONCE(vmx->nested.nested_run_pending);
14173
14174 leave_guest_mode(vcpu);
14175
14176 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
14177 vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
14178
14179 if (likely(!vmx->fail)) {
14180 if (exit_reason == -1)
14181 sync_vmcs12(vcpu, vmcs12);
14182 else
14183 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
14184 exit_qualification);
14185
14186 /*
14187 * Must happen outside of sync_vmcs12() as it will
14188 * also be used to capture vmcs12 cache as part of
14189 * capturing nVMX state for snapshot (migration).
14190 *
14191 * Otherwise, this flush will dirty guest memory at a
14192 * point it is already assumed by user-space to be
14193 * immutable.
14194 */
14195 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
14196
14197 if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
14198 vmcs12->vm_exit_msr_store_count))
14199 nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
14200 } else {
14201 /*
14202 * The only expected VM-instruction error is "VM entry with
14203 * invalid control field(s)." Anything else indicates a
14204 * problem with L0. And we should never get here with a
14205 * VMFail of any type if early consistency checks are enabled.
14206 */
14207 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
14208 VMXERR_ENTRY_INVALID_CONTROL_FIELD);
14209 WARN_ON_ONCE(nested_early_check);
14210 }
14211
14212 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
14213
14214 /* Update any VMCS fields that might have changed while L2 ran */
14215 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
14216 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
14217 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
14218
14219 if (kvm_has_tsc_control)
14220 decache_tsc_multiplier(vmx);
14221
14222 if (vmx->nested.change_vmcs01_virtual_apic_mode) {
14223 vmx->nested.change_vmcs01_virtual_apic_mode = false;
14224 vmx_set_virtual_apic_mode(vcpu);
14225 } else if (!nested_cpu_has_ept(vmcs12) &&
14226 nested_cpu_has2(vmcs12,
14227 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
14228 vmx_flush_tlb(vcpu, true);
14229 }
14230
14231 /* This is needed for same reason as it was needed in prepare_vmcs02 */
14232 vmx->host_rsp = 0;
14233
14234 /* Unpin physical memory we referred to in vmcs02 */
14235 if (vmx->nested.apic_access_page) {
14236 kvm_release_page_dirty(vmx->nested.apic_access_page);
14237 vmx->nested.apic_access_page = NULL;
14238 }
14239 if (vmx->nested.virtual_apic_page) {
14240 kvm_release_page_dirty(vmx->nested.virtual_apic_page);
14241 vmx->nested.virtual_apic_page = NULL;
14242 }
14243 if (vmx->nested.pi_desc_page) {
14244 kunmap(vmx->nested.pi_desc_page);
14245 kvm_release_page_dirty(vmx->nested.pi_desc_page);
14246 vmx->nested.pi_desc_page = NULL;
14247 vmx->nested.pi_desc = NULL;
14248 }
14249
14250 /*
14251	 * While L2 was running, an mmu_notifier may have forced the APIC access
14252	 * page's hpa to be reloaded into the L2 vmcs; reload it for L1 before entering L1.
14253 */
14254 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
14255
14256 if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs))
14257 vmx->nested.need_vmcs12_sync = true;
14258
14259 /* in case we halted in L2 */
14260 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
14261
14262 if (likely(!vmx->fail)) {
14263 /*
14264 * TODO: SDM says that with acknowledge interrupt on
14265 * exit, bit 31 of the VM-exit interrupt information
14266 * (valid interrupt) is always set to 1 on
14267 * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't
14268 * need kvm_cpu_has_interrupt(). See the commit
14269 * message for details.
14270 */
14271 if (nested_exit_intr_ack_set(vcpu) &&
14272 exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
14273 kvm_cpu_has_interrupt(vcpu)) {
14274 int irq = kvm_cpu_get_interrupt(vcpu);
14275 WARN_ON(irq < 0);
14276 vmcs12->vm_exit_intr_info = irq |
14277 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
14278 }
14279
14280 if (exit_reason != -1)
14281 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
14282 vmcs12->exit_qualification,
14283 vmcs12->idt_vectoring_info_field,
14284 vmcs12->vm_exit_intr_info,
14285 vmcs12->vm_exit_intr_error_code,
14286 KVM_ISA_VMX);
14287
14288 load_vmcs12_host_state(vcpu, vmcs12);
14289
14290 return;
14291 }
14292
14293 /*
14294 * After an early L2 VM-entry failure, we're now back
14295 * in L1 which thinks it just finished a VMLAUNCH or
14296 * VMRESUME instruction, so we need to set the failure
14297 * flag and the VM-instruction error field of the VMCS
14298 * accordingly, and skip the emulated instruction.
14299 */
14300 (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
14301
14302 /*
14303 * Restore L1's host state to KVM's software model. We're here
14304 * because a consistency check was caught by hardware, which
14305 * means some amount of guest state has been propagated to KVM's
14306 * model and needs to be unwound to the host's state.
14307 */
14308 nested_vmx_restore_host_state(vcpu);
14309
14310 vmx->fail = 0;
14311}
14312
14313/*
14314 * Forcibly leave nested mode in order to be able to reset the VCPU later on.
14315 */
14316static void vmx_leave_nested(struct kvm_vcpu *vcpu)
14317{
14318 if (is_guest_mode(vcpu)) {
14319 to_vmx(vcpu)->nested.nested_run_pending = 0;
14320 nested_vmx_vmexit(vcpu, -1, 0, 0);
14321 }
14322 free_nested(vcpu);
14323}
14324
14325static int vmx_check_intercept(struct kvm_vcpu *vcpu,
14326 struct x86_instruction_info *info,
14327 enum x86_intercept_stage stage)
14328{
14329 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
14330 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
14331
14332 /*
14333 * RDPID causes #UD if disabled through secondary execution controls.
14334 * Because it is marked as EmulateOnUD, we need to intercept it here.
14335 */
14336 if (info->intercept == x86_intercept_rdtscp &&
14337 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
14338 ctxt->exception.vector = UD_VECTOR;
14339 ctxt->exception.error_code_valid = false;
14340 return X86EMUL_PROPAGATE_FAULT;
14341 }
14342
14343 /* TODO: check more intercepts... */
14344 return X86EMUL_CONTINUE;
14345}
14346
14347#ifdef CONFIG_X86_64
14348/* (a << shift) / divisor, return 1 if overflow otherwise 0 */
14349static inline int u64_shl_div_u64(u64 a, unsigned int shift,
14350 u64 divisor, u64 *result)
14351{
14352 u64 low = a << shift, high = a >> (64 - shift);
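	/*
	 * high:low now hold the 128-bit value (a << shift); divq divides
	 * rdx:rax (high:low) by the divisor, leaving the quotient in low.
	 */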
14353
14354 /* To avoid the overflow on divq */
14355 if (high >= divisor)
14356 return 1;
14357
14358	 /* low holds the quotient, high holds the remainder, which is discarded */
14359 asm("divq %2\n\t" : "=a" (low), "=d" (high) :
14360 "rm" (divisor), "0" (low), "1" (high));
14361 *result = low;
14362
14363 return 0;
14364}
14365
14366static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
14367{
14368 struct vcpu_vmx *vmx;
14369 u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
14370
14371 if (kvm_mwait_in_guest(vcpu->kvm))
14372 return -EOPNOTSUPP;
14373
14374 vmx = to_vmx(vcpu);
14375 tscl = rdtsc();
14376 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
14377 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
14378 lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns);
14379
14380 if (delta_tsc > lapic_timer_advance_cycles)
14381 delta_tsc -= lapic_timer_advance_cycles;
14382 else
14383 delta_tsc = 0;
14384
14385 /* Convert to host delta tsc if tsc scaling is enabled */
14386 if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
14387 u64_shl_div_u64(delta_tsc,
14388 kvm_tsc_scaling_ratio_frac_bits,
14389 vcpu->arch.tsc_scaling_ratio,
14390 &delta_tsc))
14391 return -ERANGE;
14392
14393 /*
14394	 * If the delta tsc can't fit in 32 bits after the preemption-timer
14395	 * rate shift, we can't use the preemption timer at all.
14396	 * It might fit on later vmentries, but checking on every vmentry
14397	 * is costly, so just fall back to an hrtimer.
14398 */
14399 if (delta_tsc >> (cpu_preemption_timer_multi + 32))
14400 return -ERANGE;
14401
14402 vmx->hv_deadline_tsc = tscl + delta_tsc;
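	/*
	 * A zero delta means the deadline has already passed; returning 1
	 * tells the caller to treat the timer as already expired.
	 */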
14403 return delta_tsc == 0;
14404}
14405
14406static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
14407{
14408 to_vmx(vcpu)->hv_deadline_tsc = -1;
14409}
14410#endif
14411
14412static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
14413{
14414 if (!kvm_pause_in_guest(vcpu->kvm))
14415 shrink_ple_window(vcpu);
14416}
14417
14418static void vmx_slot_enable_log_dirty(struct kvm *kvm,
14419 struct kvm_memory_slot *slot)
14420{
14421 kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
14422 kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
14423}
14424
14425static void vmx_slot_disable_log_dirty(struct kvm *kvm,
14426 struct kvm_memory_slot *slot)
14427{
14428 kvm_mmu_slot_set_dirty(kvm, slot);
14429}
14430
14431static void vmx_flush_log_dirty(struct kvm *kvm)
14432{
14433 kvm_flush_pml_buffers(kvm);
14434}
14435
14436static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
14437{
14438 struct vmcs12 *vmcs12;
14439 struct vcpu_vmx *vmx = to_vmx(vcpu);
14440 gpa_t gpa;
14441 struct page *page = NULL;
14442 u64 *pml_address;
14443
14444 if (is_guest_mode(vcpu)) {
14445 WARN_ON_ONCE(vmx->nested.pml_full);
14446
14447 /*
14448 * Check if PML is enabled for the nested guest.
14449 * Whether eptp bit 6 is set is already checked
14450 * as part of A/D emulation.
14451 */
14452 vmcs12 = get_vmcs12(vcpu);
14453 if (!nested_cpu_has_pml(vmcs12))
14454 return 0;
14455
14456 if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
14457 vmx->nested.pml_full = true;
14458 return 1;
14459 }
14460
14461 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
14462
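		/*
		 * Emulate PML for L1: append the GPA to L1's PML page and
		 * decrement the guest PML index, just as hardware would.
		 */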
14463 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address);
14464 if (is_error_page(page))
14465 return 0;
14466
14467 pml_address = kmap(page);
14468 pml_address[vmcs12->guest_pml_index--] = gpa;
14469 kunmap(page);
14470 kvm_release_page_clean(page);
14471 }
14472
14473 return 0;
14474}
14475
14476static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
14477 struct kvm_memory_slot *memslot,
14478 gfn_t offset, unsigned long mask)
14479{
14480 kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
14481}
14482
14483static void __pi_post_block(struct kvm_vcpu *vcpu)
14484{
14485 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
14486 struct pi_desc old, new;
14487 unsigned int dest;
14488
14489 do {
14490 old.control = new.control = pi_desc->control;
14491 WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
14492 "Wakeup handler not enabled while the VCPU is blocked\n");
14493
14494 dest = cpu_physical_id(vcpu->cpu);
14495
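		/*
		 * NDST holds the physical APIC ID of the notification
		 * destination: the full 32 bits in x2APIC mode, bits 15:8
		 * in xAPIC mode.
		 */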
14496 if (x2apic_enabled())
14497 new.ndst = dest;
14498 else
14499 new.ndst = (dest << 8) & 0xFF00;
14500
14501 /* set 'NV' to 'notification vector' */
14502 new.nv = POSTED_INTR_VECTOR;
14503 } while (cmpxchg64(&pi_desc->control, old.control,
14504 new.control) != old.control);
14505
14506 if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
14507 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
14508 list_del(&vcpu->blocked_vcpu_list);
14509 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
14510 vcpu->pre_pcpu = -1;
14511 }
14512}
14513
14514/*
14515 * This routine does the following for a vCPU that is about to block,
14516 * when VT-d posted interrupts are enabled:
14517 * - Add the vCPU to the wakeup list, so that when an interrupt arrives
14518 * we can find the right vCPU to wake up.
14519 * - Change the posted-interrupt descriptor as follows:
14520 * 'NDST' <-- vcpu->pre_pcpu
14521 * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
14522 * - If 'ON' is set during this process, at least one interrupt has
14523 * already been posted for this vCPU and it cannot block; in that
14524 * case return 1, otherwise return 0.
14525 *
14526 */
14527static int pi_pre_block(struct kvm_vcpu *vcpu)
14528{
14529 unsigned int dest;
14530 struct pi_desc old, new;
14531 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
14532
14533 if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
14534 !irq_remapping_cap(IRQ_POSTING_CAP) ||
14535 !kvm_vcpu_apicv_active(vcpu))
14536 return 0;
14537
14538 WARN_ON(irqs_disabled());
14539 local_irq_disable();
14540 if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
14541 vcpu->pre_pcpu = vcpu->cpu;
14542 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
14543 list_add_tail(&vcpu->blocked_vcpu_list,
14544 &per_cpu(blocked_vcpu_on_cpu,
14545 vcpu->pre_pcpu));
14546 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
14547 }
14548
14549 do {
14550 old.control = new.control = pi_desc->control;
14551
14552 WARN((pi_desc->sn == 1),
14553 "Warning: SN field of posted-interrupts "
14554 "is set before blocking\n");
14555
14556 /*
14557		 * Since the vCPU can be preempted during this process,
14558		 * vcpu->cpu may differ from pre_pcpu, so set pre_pcpu as
14559		 * the destination of the wakeup notification event. The
14560		 * wakeup handler can then find the right vCPU to wake up
14561		 * if an interrupt arrives while the vCPU is in the
14562		 * blocked state.
14563 */
14564 dest = cpu_physical_id(vcpu->pre_pcpu);
14565
14566 if (x2apic_enabled())
14567 new.ndst = dest;
14568 else
14569 new.ndst = (dest << 8) & 0xFF00;
14570
14571 /* set 'NV' to 'wakeup vector' */
14572 new.nv = POSTED_INTR_WAKEUP_VECTOR;
14573 } while (cmpxchg64(&pi_desc->control, old.control,
14574 new.control) != old.control);
14575
14576 /* We should not block the vCPU if an interrupt is posted for it. */
14577 if (pi_test_on(pi_desc) == 1)
14578 __pi_post_block(vcpu);
14579
14580 local_irq_enable();
14581 return (vcpu->pre_pcpu == -1);
14582}
14583
14584static int vmx_pre_block(struct kvm_vcpu *vcpu)
14585{
14586 if (pi_pre_block(vcpu))
14587 return 1;
14588
14589 if (kvm_lapic_hv_timer_in_use(vcpu))
14590 kvm_lapic_switch_to_sw_timer(vcpu);
14591
14592 return 0;
14593}
14594
14595static void pi_post_block(struct kvm_vcpu *vcpu)
14596{
14597 if (vcpu->pre_pcpu == -1)
14598 return;
14599
14600 WARN_ON(irqs_disabled());
14601 local_irq_disable();
14602 __pi_post_block(vcpu);
14603 local_irq_enable();
14604}
14605
14606static void vmx_post_block(struct kvm_vcpu *vcpu)
14607{
14608 if (kvm_x86_ops->set_hv_timer)
14609 kvm_lapic_switch_to_hv_timer(vcpu);
14610
14611 pi_post_block(vcpu);
14612}
14613
14614/*
14615 * vmx_update_pi_irte - set IRTE for Posted-Interrupts
14616 *
14617 * @kvm: kvm
14618 * @host_irq: host irq of the interrupt
14619 * @guest_irq: gsi of the interrupt
14620 * @set: set or unset PI
14621 * returns 0 on success, < 0 on failure
14622 */
14623static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
14624 uint32_t guest_irq, bool set)
14625{
14626 struct kvm_kernel_irq_routing_entry *e;
14627 struct kvm_irq_routing_table *irq_rt;
14628 struct kvm_lapic_irq irq;
14629 struct kvm_vcpu *vcpu;
14630 struct vcpu_data vcpu_info;
14631 int idx, ret = 0;
14632
14633 if (!kvm_arch_has_assigned_device(kvm) ||
14634 !irq_remapping_cap(IRQ_POSTING_CAP) ||
14635 !kvm_vcpu_apicv_active(kvm->vcpus[0]))
14636 return 0;
14637
14638 idx = srcu_read_lock(&kvm->irq_srcu);
14639 irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
14640 if (guest_irq >= irq_rt->nr_rt_entries ||
14641 hlist_empty(&irq_rt->map[guest_irq])) {
14642 pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
14643 guest_irq, irq_rt->nr_rt_entries);
14644 goto out;
14645 }
14646
14647 hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
14648 if (e->type != KVM_IRQ_ROUTING_MSI)
14649 continue;
14650 /*
14651		 * VT-d PI cannot post multicast/broadcast interrupts to a
14652		 * vCPU, so interrupt remapping is still used for those kinds
14653		 * of interrupts.
14654		 *
14655		 * For lowest-priority interrupts, only those with a single
14656		 * CPU as the destination are supported, e.g. the user
14657		 * configures the interrupt via /proc/irq or uses irqbalance
14658		 * to make it single-CPU.
14659		 *
14660		 * Full lowest-priority interrupt support may be added later.
14661 */
14662
14663 kvm_set_msi_irq(kvm, e, &irq);
14664 if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
14665 /*
14666 * Make sure the IRTE is in remapped mode if
14667 * we don't handle it in posted mode.
14668 */
14669 ret = irq_set_vcpu_affinity(host_irq, NULL);
14670 if (ret < 0) {
14671 printk(KERN_INFO
14672 "failed to back to remapped mode, irq: %u\n",
14673 host_irq);
14674 goto out;
14675 }
14676
14677 continue;
14678 }
14679
14680 vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
14681 vcpu_info.vector = irq.vector;
14682
14683 trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
14684 vcpu_info.vector, vcpu_info.pi_desc_addr, set);
14685
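		/*
		 * Hand the PI descriptor address and guest vector to the
		 * IOMMU driver, which rewrites the IRTE into posted format
		 * (or back to remapped format when unsetting).
		 */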
14686 if (set)
14687 ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
14688 else
14689 ret = irq_set_vcpu_affinity(host_irq, NULL);
14690
14691 if (ret < 0) {
14692 printk(KERN_INFO "%s: failed to update PI IRTE\n",
14693 __func__);
14694 goto out;
14695 }
14696 }
14697
14698 ret = 0;
14699out:
14700 srcu_read_unlock(&kvm->irq_srcu, idx);
14701 return ret;
14702}
14703
14704static void vmx_setup_mce(struct kvm_vcpu *vcpu)
14705{
14706 if (vcpu->arch.mcg_cap & MCG_LMCE_P)
14707 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
14708 FEATURE_CONTROL_LMCE;
14709 else
14710 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
14711 ~FEATURE_CONTROL_LMCE;
14712}
14713
14714static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
14715{
14716 /* we need a nested vmexit to enter SMM, postpone if run is pending */
14717 if (to_vmx(vcpu)->nested.nested_run_pending)
14718 return 0;
14719 return 1;
14720}
14721
14722static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
14723{
14724 struct vcpu_vmx *vmx = to_vmx(vcpu);
14725
14726 vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
14727 if (vmx->nested.smm.guest_mode)
14728 nested_vmx_vmexit(vcpu, -1, 0, 0);
14729
14730 vmx->nested.smm.vmxon = vmx->nested.vmxon;
14731 vmx->nested.vmxon = false;
14732 vmx_clear_hlt(vcpu);
14733 return 0;
14734}
14735
14736static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
14737{
14738 struct vcpu_vmx *vmx = to_vmx(vcpu);
14739 int ret;
14740
14741 if (vmx->nested.smm.vmxon) {
14742 vmx->nested.vmxon = true;
14743 vmx->nested.smm.vmxon = false;
14744 }
14745
14746 if (vmx->nested.smm.guest_mode) {
14747 vcpu->arch.hflags &= ~HF_SMM_MASK;
14748 ret = nested_vmx_enter_non_root_mode(vcpu, false);
14749 vcpu->arch.hflags |= HF_SMM_MASK;
14750 if (ret)
14751 return ret;
14752
14753 vmx->nested.smm.guest_mode = false;
14754 }
14755 return 0;
14756}
14757
14758static int enable_smi_window(struct kvm_vcpu *vcpu)
14759{
14760 return 0;
14761}
14762
14763static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu)
14764{
14765 struct vcpu_vmx *vmx = to_vmx(vcpu);
14766
14767 /*
14768	 * If we do two consecutive get/set_nested_state()s while L2 is
14769	 * running, hv_evmcs may end up not being mapped (it is mapped from
14770	 * nested_vmx_run()/vmx_vcpu_run()). Check is_guest_mode(), since
14771	 * vmcs12 is always valid while it is true.
14772 */
14773 return is_guest_mode(vcpu) || vmx->nested.current_vmptr != -1ull ||
14774 vmx->nested.hv_evmcs;
14775}
14776
14777static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
14778 struct kvm_nested_state __user *user_kvm_nested_state,
14779 u32 user_data_size)
14780{
14781 struct vcpu_vmx *vmx;
14782 struct vmcs12 *vmcs12;
14783 struct kvm_nested_state kvm_state = {
14784 .flags = 0,
14785 .format = 0,
14786 .size = sizeof(kvm_state),
14787 .vmx.vmxon_pa = -1ull,
14788 .vmx.vmcs_pa = -1ull,
14789 };
14790
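	/*
	 * A NULL vcpu is a capability query for the maximum data size:
	 * the header plus room for a vmcs12 and a shadow vmcs12.
	 */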
14791 if (!vcpu)
14792 return kvm_state.size + 2 * VMCS12_SIZE;
14793
14794 vmx = to_vmx(vcpu);
14795 vmcs12 = get_vmcs12(vcpu);
14796
14797 if (nested_vmx_allowed(vcpu) && vmx->nested.enlightened_vmcs_enabled)
14798 kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
14799
14800 if (nested_vmx_allowed(vcpu) &&
14801 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
14802 kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
14803 kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr;
14804
14805 if (vmx_has_valid_vmcs12(vcpu)) {
14806 kvm_state.size += VMCS12_SIZE;
14807
14808 if (is_guest_mode(vcpu) &&
14809 nested_cpu_has_shadow_vmcs(vmcs12) &&
14810 vmcs12->vmcs_link_pointer != -1ull)
14811 kvm_state.size += VMCS12_SIZE;
14812 }
14813
14814 if (vmx->nested.smm.vmxon)
14815 kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
14816
14817 if (vmx->nested.smm.guest_mode)
14818 kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
14819
14820 if (is_guest_mode(vcpu)) {
14821 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
14822
14823 if (vmx->nested.nested_run_pending)
14824 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
14825 }
14826 }
14827
14828 if (user_data_size < kvm_state.size)
14829 goto out;
14830
14831 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
14832 return -EFAULT;
14833
14834 if (!vmx_has_valid_vmcs12(vcpu))
14835 goto out;
14836
14837 /*
14838 * When running L2, the authoritative vmcs12 state is in the
14839 * vmcs02. When running L1, the authoritative vmcs12 state is
14840 * in the shadow or enlightened vmcs linked to vmcs01, unless
14841 * need_vmcs12_sync is set, in which case, the authoritative
14842 * vmcs12 state is in the vmcs12 already.
14843 */
14844 if (is_guest_mode(vcpu)) {
14845 sync_vmcs12(vcpu, vmcs12);
14846 } else if (!vmx->nested.need_vmcs12_sync) {
14847 if (vmx->nested.hv_evmcs)
14848 copy_enlightened_to_vmcs12(vmx);
14849 else if (enable_shadow_vmcs)
14850 copy_shadow_to_vmcs12(vmx);
14851 }
14852
14853 if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12)))
14854 return -EFAULT;
14855
14856 if (nested_cpu_has_shadow_vmcs(vmcs12) &&
14857 vmcs12->vmcs_link_pointer != -1ull) {
14858 if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE,
14859 get_shadow_vmcs12(vcpu), sizeof(*vmcs12)))
14860 return -EFAULT;
14861 }
14862
14863out:
14864 return kvm_state.size;
14865}
14866
14867static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
14868 struct kvm_nested_state __user *user_kvm_nested_state,
14869 struct kvm_nested_state *kvm_state)
14870{
14871 struct vcpu_vmx *vmx = to_vmx(vcpu);
14872 struct vmcs12 *vmcs12;
14873 u32 exit_qual;
14874 int ret;
14875
14876 if (kvm_state->format != 0)
14877 return -EINVAL;
14878
14879 if (kvm_state->flags & KVM_STATE_NESTED_EVMCS)
14880 nested_enable_evmcs(vcpu, NULL);
14881
14882 if (!nested_vmx_allowed(vcpu))
14883 return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL;
14884
14885 if (kvm_state->vmx.vmxon_pa == -1ull) {
14886 if (kvm_state->vmx.smm.flags)
14887 return -EINVAL;
14888
14889 if (kvm_state->vmx.vmcs_pa != -1ull)
14890 return -EINVAL;
14891
14892 vmx_leave_nested(vcpu);
14893 return 0;
14894 }
14895
14896 if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa))
14897 return -EINVAL;
14898
14899 if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
14900 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
14901 return -EINVAL;
14902
14903 if (kvm_state->vmx.smm.flags &
14904 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
14905 return -EINVAL;
14906
14907 /*
14908 * SMM temporarily disables VMX, so we cannot be in guest mode,
14909 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
14910 * must be zero.
14911 */
14912 if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags)
14913 return -EINVAL;
14914
14915 if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
14916 !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
14917 return -EINVAL;
14918
14919 vmx_leave_nested(vcpu);
14920 if (kvm_state->vmx.vmxon_pa == -1ull)
14921 return 0;
14922
14923 vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa;
14924 ret = enter_vmx_operation(vcpu);
14925 if (ret)
14926 return ret;
14927
14928 /* Empty 'VMXON' state is permitted */
14929 if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12))
14930 return 0;
14931
14932 if (kvm_state->vmx.vmcs_pa != -1ull) {
14933 if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa ||
14934 !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa))
14935 return -EINVAL;
14936
14937 set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa);
14938 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
14939 /*
14940 * Sync eVMCS upon entry as we may not have
14941 * HV_X64_MSR_VP_ASSIST_PAGE set up yet.
14942 */
14943 vmx->nested.need_vmcs12_sync = true;
14944 } else {
14945 return -EINVAL;
14946 }
14947
14948 if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
14949 vmx->nested.smm.vmxon = true;
14950 vmx->nested.vmxon = false;
14951
14952 if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
14953 vmx->nested.smm.guest_mode = true;
14954 }
14955
14956 vmcs12 = get_vmcs12(vcpu);
14957 if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12)))
14958 return -EFAULT;
14959
14960 if (vmcs12->hdr.revision_id != VMCS12_REVISION)
14961 return -EINVAL;
14962
14963 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
14964 return 0;
14965
14966 vmx->nested.nested_run_pending =
14967 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
14968
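	/* A shadow vmcs12, if any, follows the ordinary vmcs12 in the user buffer. */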
14969 if (nested_cpu_has_shadow_vmcs(vmcs12) &&
14970 vmcs12->vmcs_link_pointer != -1ull) {
14971 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
14972 if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12))
14973 return -EINVAL;
14974
14975 if (copy_from_user(shadow_vmcs12,
14976 user_kvm_nested_state->data + VMCS12_SIZE,
14977 sizeof(*vmcs12)))
14978 return -EFAULT;
14979
14980 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
14981 !shadow_vmcs12->hdr.shadow_vmcs)
14982 return -EINVAL;
14983 }
14984
14985 if (check_vmentry_prereqs(vcpu, vmcs12) ||
14986 check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
14987 return -EINVAL;
14988
14989 vmx->nested.dirty_vmcs12 = true;
14990 ret = nested_vmx_enter_non_root_mode(vcpu, false);
14991 if (ret)
14992 return -EINVAL;
14993
14994 return 0;
14995}
14996
14997static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
14998 .cpu_has_kvm_support = cpu_has_kvm_support,
14999 .disabled_by_bios = vmx_disabled_by_bios,
15000 .hardware_setup = hardware_setup,
15001 .hardware_unsetup = hardware_unsetup,
15002 .check_processor_compatibility = vmx_check_processor_compat,
15003 .hardware_enable = hardware_enable,
15004 .hardware_disable = hardware_disable,
15005 .cpu_has_accelerated_tpr = report_flexpriority,
15006 .has_emulated_msr = vmx_has_emulated_msr,
15007
15008 .vm_init = vmx_vm_init,
15009 .vm_alloc = vmx_vm_alloc,
15010 .vm_free = vmx_vm_free,
15011
15012 .vcpu_create = vmx_create_vcpu,
15013 .vcpu_free = vmx_free_vcpu,
15014 .vcpu_reset = vmx_vcpu_reset,
15015
15016 .prepare_guest_switch = vmx_prepare_switch_to_guest,
15017 .vcpu_load = vmx_vcpu_load,
15018 .vcpu_put = vmx_vcpu_put,
15019
15020 .update_bp_intercept = update_exception_bitmap,
15021 .get_msr_feature = vmx_get_msr_feature,
15022 .get_msr = vmx_get_msr,
15023 .set_msr = vmx_set_msr,
15024 .get_segment_base = vmx_get_segment_base,
15025 .get_segment = vmx_get_segment,
15026 .set_segment = vmx_set_segment,
15027 .get_cpl = vmx_get_cpl,
15028 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
15029 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
15030 .decache_cr3 = vmx_decache_cr3,
15031 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
15032 .set_cr0 = vmx_set_cr0,
15033 .set_cr3 = vmx_set_cr3,
15034 .set_cr4 = vmx_set_cr4,
15035 .set_efer = vmx_set_efer,
15036 .get_idt = vmx_get_idt,
15037 .set_idt = vmx_set_idt,
15038 .get_gdt = vmx_get_gdt,
15039 .set_gdt = vmx_set_gdt,
15040 .get_dr6 = vmx_get_dr6,
15041 .set_dr6 = vmx_set_dr6,
15042 .set_dr7 = vmx_set_dr7,
15043 .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
15044 .cache_reg = vmx_cache_reg,
15045 .get_rflags = vmx_get_rflags,
15046 .set_rflags = vmx_set_rflags,
15047
15048 .tlb_flush = vmx_flush_tlb,
15049 .tlb_flush_gva = vmx_flush_tlb_gva,
15050
15051 .run = vmx_vcpu_run,
15052 .handle_exit = vmx_handle_exit,
15053 .skip_emulated_instruction = skip_emulated_instruction,
15054 .set_interrupt_shadow = vmx_set_interrupt_shadow,
15055 .get_interrupt_shadow = vmx_get_interrupt_shadow,
15056 .patch_hypercall = vmx_patch_hypercall,
15057 .set_irq = vmx_inject_irq,
15058 .set_nmi = vmx_inject_nmi,
15059 .queue_exception = vmx_queue_exception,
15060 .cancel_injection = vmx_cancel_injection,
15061 .interrupt_allowed = vmx_interrupt_allowed,
15062 .nmi_allowed = vmx_nmi_allowed,
15063 .get_nmi_mask = vmx_get_nmi_mask,
15064 .set_nmi_mask = vmx_set_nmi_mask,
15065 .enable_nmi_window = enable_nmi_window,
15066 .enable_irq_window = enable_irq_window,
15067 .update_cr8_intercept = update_cr8_intercept,
15068 .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
15069 .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
15070 .get_enable_apicv = vmx_get_enable_apicv,
15071 .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
15072 .load_eoi_exitmap = vmx_load_eoi_exitmap,
15073 .apicv_post_state_restore = vmx_apicv_post_state_restore,
15074 .hwapic_irr_update = vmx_hwapic_irr_update,
15075 .hwapic_isr_update = vmx_hwapic_isr_update,
15076 .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
15077 .sync_pir_to_irr = vmx_sync_pir_to_irr,
15078 .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
15079
15080 .set_tss_addr = vmx_set_tss_addr,
15081 .set_identity_map_addr = vmx_set_identity_map_addr,
15082 .get_tdp_level = get_ept_level,
15083 .get_mt_mask = vmx_get_mt_mask,
15084
15085 .get_exit_info = vmx_get_exit_info,
15086
15087 .get_lpage_level = vmx_get_lpage_level,
15088
15089 .cpuid_update = vmx_cpuid_update,
15090
15091 .rdtscp_supported = vmx_rdtscp_supported,
15092 .invpcid_supported = vmx_invpcid_supported,
15093
15094 .set_supported_cpuid = vmx_set_supported_cpuid,
15095
15096 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
15097
15098 .read_l1_tsc_offset = vmx_read_l1_tsc_offset,
15099 .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
15100
15101 .set_tdp_cr3 = vmx_set_cr3,
15102
15103 .check_intercept = vmx_check_intercept,
15104 .handle_external_intr = vmx_handle_external_intr,
15105 .mpx_supported = vmx_mpx_supported,
15106 .xsaves_supported = vmx_xsaves_supported,
15107 .umip_emulated = vmx_umip_emulated,
15108
15109 .check_nested_events = vmx_check_nested_events,
15110 .request_immediate_exit = vmx_request_immediate_exit,
15111
15112 .sched_in = vmx_sched_in,
15113
15114 .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
15115 .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
15116 .flush_log_dirty = vmx_flush_log_dirty,
15117 .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
15118 .write_log_dirty = vmx_write_pml_buffer,
15119
15120 .pre_block = vmx_pre_block,
15121 .post_block = vmx_post_block,
15122
15123 .pmu_ops = &intel_pmu_ops,
15124
15125 .update_pi_irte = vmx_update_pi_irte,
15126
15127#ifdef CONFIG_X86_64
15128 .set_hv_timer = vmx_set_hv_timer,
15129 .cancel_hv_timer = vmx_cancel_hv_timer,
15130#endif
15131
15132 .setup_mce = vmx_setup_mce,
15133
15134 .get_nested_state = vmx_get_nested_state,
15135 .set_nested_state = vmx_set_nested_state,
15136 .get_vmcs12_pages = nested_get_vmcs12_pages,
15137
15138 .smi_allowed = vmx_smi_allowed,
15139 .pre_enter_smm = vmx_pre_enter_smm,
15140 .pre_leave_smm = vmx_pre_leave_smm,
15141 .enable_smi_window = enable_smi_window,
15142
15143 .nested_enable_evmcs = nested_enable_evmcs,
15144};
15145
15146static void vmx_cleanup_l1d_flush(void)
15147{
15148 if (vmx_l1d_flush_pages) {
15149 free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
15150 vmx_l1d_flush_pages = NULL;
15151 }
15152 /* Restore state so sysfs ignores VMX */
15153 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
15154}
15155
15156static void vmx_exit(void)
15157{
15158#ifdef CONFIG_KEXEC_CORE
15159 RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
15160 synchronize_rcu();
15161#endif
15162
15163 kvm_exit();
15164
15165#if IS_ENABLED(CONFIG_HYPERV)
15166 if (static_branch_unlikely(&enable_evmcs)) {
15167 int cpu;
15168 struct hv_vp_assist_page *vp_ap;
15169 /*
15170 * Reset everything to support using non-enlightened VMCS
15171 * access later (e.g. when we reload the module with
15172 * enlightened_vmcs=0)
15173 */
15174 for_each_online_cpu(cpu) {
15175 vp_ap = hv_get_vp_assist_page(cpu);
15176
15177 if (!vp_ap)
15178 continue;
15179
15180 vp_ap->current_nested_vmcs = 0;
15181 vp_ap->enlighten_vmentry = 0;
15182 }
15183
15184 static_branch_disable(&enable_evmcs);
15185 }
15186#endif
15187 vmx_cleanup_l1d_flush();
15188}
15189module_exit(vmx_exit);
15190
15191static int __init vmx_init(void)
15192{
15193 int r;
15194
15195#if IS_ENABLED(CONFIG_HYPERV)
15196 /*
15197	 * Use enlightened VMCS only if the hypervisor recommends it and the
15198	 * host supports eVMCS version 1 or above. eVMCS support can also be
15199	 * disabled via the module parameter.
15200 */
15201 if (enlightened_vmcs &&
15202 ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
15203 (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
15204 KVM_EVMCS_VERSION) {
15205 int cpu;
15206
15207 /* Check that we have assist pages on all online CPUs */
15208 for_each_online_cpu(cpu) {
15209 if (!hv_get_vp_assist_page(cpu)) {
15210 enlightened_vmcs = false;
15211 break;
15212 }
15213 }
15214
15215 if (enlightened_vmcs) {
15216 pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
15217 static_branch_enable(&enable_evmcs);
15218 }
15219 } else {
15220 enlightened_vmcs = false;
15221 }
15222#endif
15223
15224 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
15225 __alignof__(struct vcpu_vmx), THIS_MODULE);
15226 if (r)
15227 return r;
15228
15229 /*
15230 * Must be called after kvm_init() so enable_ept is properly set
15231	 * up. Hand in the mitigation parameter value that was stored by the
15232	 * pre-module-init parser. If no parameter was given, it will
15233	 * contain 'auto', which is turned into the default 'cond'
15234 * mitigation mode.
15235 */
15236 if (boot_cpu_has(X86_BUG_L1TF)) {
15237 r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
15238 if (r) {
15239 vmx_exit();
15240 return r;
15241 }
15242 }
15243
15244#ifdef CONFIG_KEXEC_CORE
15245 rcu_assign_pointer(crash_vmclear_loaded_vmcss,
15246 crash_vmclear_local_loaded_vmcss);
15247#endif
15248 vmx_check_vmcs12_offsets();
15249
15250 return 0;
15251}
15252module_init(vmx_init);
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
new file mode 100644
index 000000000000..854e144131c6
--- /dev/null
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -0,0 +1,343 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef __KVM_X86_VMX_CAPS_H
3#define __KVM_X86_VMX_CAPS_H
4
5#include "lapic.h"
6
7extern bool __read_mostly enable_vpid;
8extern bool __read_mostly flexpriority_enabled;
9extern bool __read_mostly enable_ept;
10extern bool __read_mostly enable_unrestricted_guest;
11extern bool __read_mostly enable_ept_ad_bits;
12extern bool __read_mostly enable_pml;
13extern int __read_mostly pt_mode;
14
15#define PT_MODE_SYSTEM 0
16#define PT_MODE_HOST_GUEST 1
17
18struct nested_vmx_msrs {
19 /*
20 * We only store the "true" versions of the VMX capability MSRs. We
21 * generate the "non-true" versions by setting the must-be-1 bits
22 * according to the SDM.
23 */
24 u32 procbased_ctls_low;
25 u32 procbased_ctls_high;
26 u32 secondary_ctls_low;
27 u32 secondary_ctls_high;
28 u32 pinbased_ctls_low;
29 u32 pinbased_ctls_high;
30 u32 exit_ctls_low;
31 u32 exit_ctls_high;
32 u32 entry_ctls_low;
33 u32 entry_ctls_high;
34 u32 misc_low;
35 u32 misc_high;
36 u32 ept_caps;
37 u32 vpid_caps;
38 u64 basic;
39 u64 cr0_fixed0;
40 u64 cr0_fixed1;
41 u64 cr4_fixed0;
42 u64 cr4_fixed1;
43 u64 vmcs_enum;
44 u64 vmfunc_controls;
45};
46
47struct vmcs_config {
48 int size;
49 int order;
50 u32 basic_cap;
51 u32 revision_id;
52 u32 pin_based_exec_ctrl;
53 u32 cpu_based_exec_ctrl;
54 u32 cpu_based_2nd_exec_ctrl;
55 u32 vmexit_ctrl;
56 u32 vmentry_ctrl;
57 struct nested_vmx_msrs nested;
58};
59extern struct vmcs_config vmcs_config;
60
61struct vmx_capability {
62 u32 ept;
63 u32 vpid;
64};
65extern struct vmx_capability vmx_capability;
66
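/*
 * basic_cap caches the upper 32 bits of MSR_IA32_VMX_BASIC; shifting it back
 * up lines it up with the 64-bit VMX_BASIC_INOUT flag (bit 54).
 */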
67static inline bool cpu_has_vmx_basic_inout(void)
68{
69 return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
70}
71
72static inline bool cpu_has_virtual_nmis(void)
73{
74 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
75}
76
77static inline bool cpu_has_vmx_preemption_timer(void)
78{
79 return vmcs_config.pin_based_exec_ctrl &
80 PIN_BASED_VMX_PREEMPTION_TIMER;
81}
82
83static inline bool cpu_has_vmx_posted_intr(void)
84{
85 return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
86 vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
87}
88
89static inline bool cpu_has_load_ia32_efer(void)
90{
91 return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER) &&
92 (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_EFER);
93}
94
95static inline bool cpu_has_load_perf_global_ctrl(void)
96{
97 return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
98 (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
99}
100
101static inline bool vmx_mpx_supported(void)
102{
103 return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
104 (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
105}
106
107static inline bool cpu_has_vmx_tpr_shadow(void)
108{
109 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
110}
111
112static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu)
113{
114 return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu);
115}
116
117static inline bool cpu_has_vmx_msr_bitmap(void)
118{
119 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
120}
121
122static inline bool cpu_has_secondary_exec_ctrls(void)
123{
124 return vmcs_config.cpu_based_exec_ctrl &
125 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
126}
127
128static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
129{
130 return vmcs_config.cpu_based_2nd_exec_ctrl &
131 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
132}
133
134static inline bool cpu_has_vmx_ept(void)
135{
136 return vmcs_config.cpu_based_2nd_exec_ctrl &
137 SECONDARY_EXEC_ENABLE_EPT;
138}
139
140static inline bool vmx_umip_emulated(void)
141{
142 return vmcs_config.cpu_based_2nd_exec_ctrl &
143 SECONDARY_EXEC_DESC;
144}
145
146static inline bool cpu_has_vmx_rdtscp(void)
147{
148 return vmcs_config.cpu_based_2nd_exec_ctrl &
149 SECONDARY_EXEC_RDTSCP;
150}
151
152static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
153{
154 return vmcs_config.cpu_based_2nd_exec_ctrl &
155 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
156}
157
158static inline bool cpu_has_vmx_vpid(void)
159{
160 return vmcs_config.cpu_based_2nd_exec_ctrl &
161 SECONDARY_EXEC_ENABLE_VPID;
162}
163
164static inline bool cpu_has_vmx_wbinvd_exit(void)
165{
166 return vmcs_config.cpu_based_2nd_exec_ctrl &
167 SECONDARY_EXEC_WBINVD_EXITING;
168}
169
170static inline bool cpu_has_vmx_unrestricted_guest(void)
171{
172 return vmcs_config.cpu_based_2nd_exec_ctrl &
173 SECONDARY_EXEC_UNRESTRICTED_GUEST;
174}
175
176static inline bool cpu_has_vmx_apic_register_virt(void)
177{
178 return vmcs_config.cpu_based_2nd_exec_ctrl &
179 SECONDARY_EXEC_APIC_REGISTER_VIRT;
180}
181
182static inline bool cpu_has_vmx_virtual_intr_delivery(void)
183{
184 return vmcs_config.cpu_based_2nd_exec_ctrl &
185 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
186}
187
188static inline bool cpu_has_vmx_ple(void)
189{
190 return vmcs_config.cpu_based_2nd_exec_ctrl &
191 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
192}
193
194static inline bool vmx_rdrand_supported(void)
195{
196 return vmcs_config.cpu_based_2nd_exec_ctrl &
197 SECONDARY_EXEC_RDRAND_EXITING;
198}
199
200static inline bool cpu_has_vmx_invpcid(void)
201{
202 return vmcs_config.cpu_based_2nd_exec_ctrl &
203 SECONDARY_EXEC_ENABLE_INVPCID;
204}
205
206static inline bool cpu_has_vmx_vmfunc(void)
207{
208 return vmcs_config.cpu_based_2nd_exec_ctrl &
209 SECONDARY_EXEC_ENABLE_VMFUNC;
210}
211
212static inline bool cpu_has_vmx_shadow_vmcs(void)
213{
214 u64 vmx_msr;
215
216 /* check if the cpu supports writing r/o exit information fields */
217 rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
218 if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
219 return false;
220
221 return vmcs_config.cpu_based_2nd_exec_ctrl &
222 SECONDARY_EXEC_SHADOW_VMCS;
223}
224
225static inline bool cpu_has_vmx_encls_vmexit(void)
226{
227 return vmcs_config.cpu_based_2nd_exec_ctrl &
228 SECONDARY_EXEC_ENCLS_EXITING;
229}
230
231static inline bool vmx_rdseed_supported(void)
232{
233 return vmcs_config.cpu_based_2nd_exec_ctrl &
234 SECONDARY_EXEC_RDSEED_EXITING;
235}
236
237static inline bool cpu_has_vmx_pml(void)
238{
239 return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
240}
241
242static inline bool vmx_xsaves_supported(void)
243{
244 return vmcs_config.cpu_based_2nd_exec_ctrl &
245 SECONDARY_EXEC_XSAVES;
246}
247
248static inline bool cpu_has_vmx_tsc_scaling(void)
249{
250 return vmcs_config.cpu_based_2nd_exec_ctrl &
251 SECONDARY_EXEC_TSC_SCALING;
252}
253
254static inline bool cpu_has_vmx_apicv(void)
255{
256 return cpu_has_vmx_apic_register_virt() &&
257 cpu_has_vmx_virtual_intr_delivery() &&
258 cpu_has_vmx_posted_intr();
259}
260
261static inline bool cpu_has_vmx_flexpriority(void)
262{
263 return cpu_has_vmx_tpr_shadow() &&
264 cpu_has_vmx_virtualize_apic_accesses();
265}
266
267static inline bool cpu_has_vmx_ept_execute_only(void)
268{
269 return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
270}
271
272static inline bool cpu_has_vmx_ept_4levels(void)
273{
274 return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
275}
276
277static inline bool cpu_has_vmx_ept_5levels(void)
278{
279 return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT;
280}
281
282static inline bool cpu_has_vmx_ept_mt_wb(void)
283{
284 return vmx_capability.ept & VMX_EPTP_WB_BIT;
285}
286
287static inline bool cpu_has_vmx_ept_2m_page(void)
288{
289 return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
290}
291
292static inline bool cpu_has_vmx_ept_1g_page(void)
293{
294 return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
295}
296
297static inline bool cpu_has_vmx_ept_ad_bits(void)
298{
299 return vmx_capability.ept & VMX_EPT_AD_BIT;
300}
301
302static inline bool cpu_has_vmx_invept_context(void)
303{
304 return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
305}
306
307static inline bool cpu_has_vmx_invept_global(void)
308{
309 return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
310}
311
312static inline bool cpu_has_vmx_invvpid(void)
313{
314 return vmx_capability.vpid & VMX_VPID_INVVPID_BIT;
315}
316
317static inline bool cpu_has_vmx_invvpid_individual_addr(void)
318{
319 return vmx_capability.vpid & VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT;
320}
321
322static inline bool cpu_has_vmx_invvpid_single(void)
323{
324 return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
325}
326
327static inline bool cpu_has_vmx_invvpid_global(void)
328{
329 return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
330}
331
332static inline bool cpu_has_vmx_intel_pt(void)
333{
334 u64 vmx_msr;
335
336 rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
337 return (vmx_msr & MSR_IA32_VMX_MISC_INTEL_PT) &&
338 (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) &&
339 (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_IA32_RTIT_CTL) &&
340 (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL);
341}
342
343#endif /* __KVM_X86_VMX_CAPS_H */
diff --git a/arch/x86/kvm/vmx_evmcs.h b/arch/x86/kvm/vmx/evmcs.c
index 210a884090ad..95bc2247478d 100644
--- a/arch/x86/kvm/vmx_evmcs.h
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -1,20 +1,22 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1// SPDX-License-Identifier: GPL-2.0
2#ifndef __KVM_X86_VMX_EVMCS_H
3#define __KVM_X86_VMX_EVMCS_H
4 2
5#include <asm/hyperv-tlfs.h> 3#include <linux/errno.h>
4#include <linux/smp.h>
5
6#include "evmcs.h"
7#include "vmcs.h"
8#include "vmx.h"
9
10DEFINE_STATIC_KEY_FALSE(enable_evmcs);
11
12#if IS_ENABLED(CONFIG_HYPERV)
6 13
7#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) 14#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
8#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) 15#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
9#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ 16#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \
10 {EVMCS1_OFFSET(name), clean_field} 17 {EVMCS1_OFFSET(name), clean_field}
11 18
12struct evmcs_field { 19const struct evmcs_field vmcs_field_to_evmcs_1[] = {
13 u16 offset;
14 u16 clean_field;
15};
16
17static const struct evmcs_field vmcs_field_to_evmcs_1[] = {
18 /* 64 bit rw */ 20 /* 64 bit rw */
19 EVMCS1_FIELD(GUEST_RIP, guest_rip, 21 EVMCS1_FIELD(GUEST_RIP, guest_rip,
20 HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), 22 HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
@@ -298,27 +300,53 @@ static const struct evmcs_field vmcs_field_to_evmcs_1[] = {
298 EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id, 300 EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id,
299 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), 301 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT),
300}; 302};
303const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1);
301 304
302static __always_inline int get_evmcs_offset(unsigned long field, 305void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
303 u16 *clean_field)
304{ 306{
305 unsigned int index = ROL16(field, 6); 307 vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL;
306 const struct evmcs_field *evmcs_field; 308 vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
307 309
308 if (unlikely(index >= ARRAY_SIZE(vmcs_field_to_evmcs_1))) { 310 vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
309 WARN_ONCE(1, "KVM: accessing unsupported EVMCS field %lx\n", 311 vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
310 field);
311 return -ENOENT;
312 }
313 312
314 evmcs_field = &vmcs_field_to_evmcs_1[index]; 313}
314#endif
315 315
316 if (clean_field) 316uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu)
317 *clean_field = evmcs_field->clean_field; 317{
318 struct vcpu_vmx *vmx = to_vmx(vcpu);
319 /*
320 * vmcs_version represents the range of supported Enlightened VMCS
321 * versions: lower 8 bits is the minimal version, higher 8 bits is the
322 * maximum supported version. KVM supports versions from 1 to
323 * KVM_EVMCS_VERSION.
324 */
325 if (vmx->nested.enlightened_vmcs_enabled)
326 return (KVM_EVMCS_VERSION << 8) | 1;
318 327
319 return evmcs_field->offset; 328 return 0;
320} 329}
321 330
322#undef ROL16 331int nested_enable_evmcs(struct kvm_vcpu *vcpu,
332 uint16_t *vmcs_version)
333{
334 struct vcpu_vmx *vmx = to_vmx(vcpu);
335
336 if (vmcs_version)
337 *vmcs_version = nested_get_evmcs_version(vcpu);
338
339 /* We don't support disabling the feature for simplicity. */
340 if (vmx->nested.enlightened_vmcs_enabled)
341 return 0;
323 342
324#endif /* __KVM_X86_VMX_EVMCS_H */ 343 vmx->nested.enlightened_vmcs_enabled = true;
344
345 vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL;
346 vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
347 vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
348 vmx->nested.msrs.secondary_ctls_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
349 vmx->nested.msrs.vmfunc_controls &= ~EVMCS1_UNSUPPORTED_VMFUNC;
350
351 return 0;
352}
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h
new file mode 100644
index 000000000000..e0fcef85b332
--- /dev/null
+++ b/arch/x86/kvm/vmx/evmcs.h
@@ -0,0 +1,202 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef __KVM_X86_VMX_EVMCS_H
3#define __KVM_X86_VMX_EVMCS_H
4
5#include <linux/jump_label.h>
6
7#include <asm/hyperv-tlfs.h>
8#include <asm/mshyperv.h>
9#include <asm/vmx.h>
10
11#include "capabilities.h"
12#include "vmcs.h"
13
14struct vmcs_config;
15
16DECLARE_STATIC_KEY_FALSE(enable_evmcs);
17
18#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs))
19
20#define KVM_EVMCS_VERSION 1
21
22/*
23 * Enlightened VMCSv1 doesn't support these:
24 *
25 * POSTED_INTR_NV = 0x00000002,
26 * GUEST_INTR_STATUS = 0x00000810,
27 * APIC_ACCESS_ADDR = 0x00002014,
28 * POSTED_INTR_DESC_ADDR = 0x00002016,
29 * EOI_EXIT_BITMAP0 = 0x0000201c,
30 * EOI_EXIT_BITMAP1 = 0x0000201e,
31 * EOI_EXIT_BITMAP2 = 0x00002020,
32 * EOI_EXIT_BITMAP3 = 0x00002022,
33 * GUEST_PML_INDEX = 0x00000812,
34 * PML_ADDRESS = 0x0000200e,
35 * VM_FUNCTION_CONTROL = 0x00002018,
36 * EPTP_LIST_ADDRESS = 0x00002024,
37 * VMREAD_BITMAP = 0x00002026,
38 * VMWRITE_BITMAP = 0x00002028,
39 *
40 * TSC_MULTIPLIER = 0x00002032,
41 * PLE_GAP = 0x00004020,
42 * PLE_WINDOW = 0x00004022,
43 * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E,
44 * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808,
45 * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04,
46 *
47 * Currently unsupported in KVM:
48 * GUEST_IA32_RTIT_CTL = 0x00002814,
49 */
50#define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \
51 PIN_BASED_VMX_PREEMPTION_TIMER)
52#define EVMCS1_UNSUPPORTED_2NDEXEC \
53 (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \
54 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \
55 SECONDARY_EXEC_APIC_REGISTER_VIRT | \
56 SECONDARY_EXEC_ENABLE_PML | \
57 SECONDARY_EXEC_ENABLE_VMFUNC | \
58 SECONDARY_EXEC_SHADOW_VMCS | \
59 SECONDARY_EXEC_TSC_SCALING | \
60 SECONDARY_EXEC_PAUSE_LOOP_EXITING)
61#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
62#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
63#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING)
64
65#if IS_ENABLED(CONFIG_HYPERV)
66
67struct evmcs_field {
68 u16 offset;
69 u16 clean_field;
70};
71
72extern const struct evmcs_field vmcs_field_to_evmcs_1[];
73extern const unsigned int nr_evmcs_1_fields;
74
75#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
76
77static __always_inline int get_evmcs_offset(unsigned long field,
78 u16 *clean_field)
79{
80 unsigned int index = ROL16(field, 6);
81 const struct evmcs_field *evmcs_field;
82
83 if (unlikely(index >= nr_evmcs_1_fields)) {
84 WARN_ONCE(1, "KVM: accessing unsupported EVMCS field %lx\n",
85 field);
86 return -ENOENT;
87 }
88
89 evmcs_field = &vmcs_field_to_evmcs_1[index];
90
91 if (clean_field)
92 *clean_field = evmcs_field->clean_field;
93
94 return evmcs_field->offset;
95}
96
97#undef ROL16
98
99static inline void evmcs_write64(unsigned long field, u64 value)
100{
101 u16 clean_field;
102 int offset = get_evmcs_offset(field, &clean_field);
103
104 if (offset < 0)
105 return;
106
107 *(u64 *)((char *)current_evmcs + offset) = value;
108
109 current_evmcs->hv_clean_fields &= ~clean_field;
110}
111
112static inline void evmcs_write32(unsigned long field, u32 value)
113{
114 u16 clean_field;
115 int offset = get_evmcs_offset(field, &clean_field);
116
117 if (offset < 0)
118 return;
119
120 *(u32 *)((char *)current_evmcs + offset) = value;
121 current_evmcs->hv_clean_fields &= ~clean_field;
122}
123
124static inline void evmcs_write16(unsigned long field, u16 value)
125{
126 u16 clean_field;
127 int offset = get_evmcs_offset(field, &clean_field);
128
129 if (offset < 0)
130 return;
131
132 *(u16 *)((char *)current_evmcs + offset) = value;
133 current_evmcs->hv_clean_fields &= ~clean_field;
134}
135
136static inline u64 evmcs_read64(unsigned long field)
137{
138 int offset = get_evmcs_offset(field, NULL);
139
140 if (offset < 0)
141 return 0;
142
143 return *(u64 *)((char *)current_evmcs + offset);
144}
145
146static inline u32 evmcs_read32(unsigned long field)
147{
148 int offset = get_evmcs_offset(field, NULL);
149
150 if (offset < 0)
151 return 0;
152
153 return *(u32 *)((char *)current_evmcs + offset);
154}
155
156static inline u16 evmcs_read16(unsigned long field)
157{
158 int offset = get_evmcs_offset(field, NULL);
159
160 if (offset < 0)
161 return 0;
162
163 return *(u16 *)((char *)current_evmcs + offset);
164}
165
166static inline void evmcs_touch_msr_bitmap(void)
167{
168 if (unlikely(!current_evmcs))
169 return;
170
171 if (current_evmcs->hv_enlightenments_control.msr_bitmap)
172 current_evmcs->hv_clean_fields &=
173 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
174}
175
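/*
 * With an enlightened VMCS there is no VMPTRLD; the VMCS is activated by
 * pointing the Hyper-V VP assist page at it and requesting enlightened
 * VM entry.
 */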
176static inline void evmcs_load(u64 phys_addr)
177{
178 struct hv_vp_assist_page *vp_ap =
179 hv_get_vp_assist_page(smp_processor_id());
180
181 vp_ap->current_nested_vmcs = phys_addr;
182 vp_ap->enlighten_vmentry = 1;
183}
184
185void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf);
186#else /* !IS_ENABLED(CONFIG_HYPERV) */
187static inline void evmcs_write64(unsigned long field, u64 value) {}
188static inline void evmcs_write32(unsigned long field, u32 value) {}
189static inline void evmcs_write16(unsigned long field, u16 value) {}
190static inline u64 evmcs_read64(unsigned long field) { return 0; }
191static inline u32 evmcs_read32(unsigned long field) { return 0; }
192static inline u16 evmcs_read16(unsigned long field) { return 0; }
193static inline void evmcs_load(u64 phys_addr) {}
194static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
195static inline void evmcs_touch_msr_bitmap(void) {}
196#endif /* IS_ENABLED(CONFIG_HYPERV) */
197
198uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu);
199int nested_enable_evmcs(struct kvm_vcpu *vcpu,
200 uint16_t *vmcs_version);
201
202#endif /* __KVM_X86_VMX_EVMCS_H */
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
new file mode 100644
index 000000000000..3170e291215d
--- /dev/null
+++ b/arch/x86/kvm/vmx/nested.c
@@ -0,0 +1,5721 @@
1// SPDX-License-Identifier: GPL-2.0
2
3#include <linux/frame.h>
4#include <linux/percpu.h>
5
6#include <asm/debugreg.h>
7#include <asm/mmu_context.h>
8
9#include "cpuid.h"
10#include "hyperv.h"
11#include "mmu.h"
12#include "nested.h"
13#include "trace.h"
14#include "x86.h"
15
16static bool __read_mostly enable_shadow_vmcs = 1;
17module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
18
19static bool __read_mostly nested_early_check = 0;
20module_param(nested_early_check, bool, S_IRUGO);
21
22/*
23 * Hyper-V requires all of these, so mark them as supported even though
24 * they are just treated the same as all-context.
25 */
26#define VMX_VPID_EXTENT_SUPPORTED_MASK \
27 (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \
28 VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \
29 VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \
30 VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
31
32#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
33
34enum {
35 VMX_VMREAD_BITMAP,
36 VMX_VMWRITE_BITMAP,
37 VMX_BITMAP_NR
38};
39static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
40
41#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
42#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])
43
44static u16 shadow_read_only_fields[] = {
45#define SHADOW_FIELD_RO(x) x,
46#include "vmcs_shadow_fields.h"
47};
48static int max_shadow_read_only_fields =
49 ARRAY_SIZE(shadow_read_only_fields);
50
51static u16 shadow_read_write_fields[] = {
52#define SHADOW_FIELD_RW(x) x,
53#include "vmcs_shadow_fields.h"
54};
55static int max_shadow_read_write_fields =
56 ARRAY_SIZE(shadow_read_write_fields);
57
58void init_vmcs_shadow_fields(void)
59{
60 int i, j;
61
62 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
63 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
64
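	/*
	 * Verify that each 64-bit field is immediately followed by its
	 * _HIGH counterpart (field + 1), and compact the arrays: on 64-bit
	 * hosts the _HIGH halves are dropped, and read/write fields the
	 * CPU cannot shadow are skipped.
	 */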
65 for (i = j = 0; i < max_shadow_read_only_fields; i++) {
66 u16 field = shadow_read_only_fields[i];
67
68 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
69 (i + 1 == max_shadow_read_only_fields ||
70 shadow_read_only_fields[i + 1] != field + 1))
71 pr_err("Missing field from shadow_read_only_field %x\n",
72 field + 1);
73
74 clear_bit(field, vmx_vmread_bitmap);
75#ifdef CONFIG_X86_64
76 if (field & 1)
77 continue;
78#endif
79 if (j < i)
80 shadow_read_only_fields[j] = field;
81 j++;
82 }
83 max_shadow_read_only_fields = j;
84
85 for (i = j = 0; i < max_shadow_read_write_fields; i++) {
86 u16 field = shadow_read_write_fields[i];
87
88 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
89 (i + 1 == max_shadow_read_write_fields ||
90 shadow_read_write_fields[i + 1] != field + 1))
91 pr_err("Missing field from shadow_read_write_field %x\n",
92 field + 1);
93
94 /*
95 * PML and the preemption timer can be emulated, but the
96 * processor cannot vmwrite to fields that don't exist
97 * on bare metal.
98 */
99 switch (field) {
100 case GUEST_PML_INDEX:
101 if (!cpu_has_vmx_pml())
102 continue;
103 break;
104 case VMX_PREEMPTION_TIMER_VALUE:
105 if (!cpu_has_vmx_preemption_timer())
106 continue;
107 break;
108 case GUEST_INTR_STATUS:
109 if (!cpu_has_vmx_apicv())
110 continue;
111 break;
112 default:
113 break;
114 }
115
116 clear_bit(field, vmx_vmwrite_bitmap);
117 clear_bit(field, vmx_vmread_bitmap);
118#ifdef CONFIG_X86_64
119 if (field & 1)
120 continue;
121#endif
122 if (j < i)
123 shadow_read_write_fields[j] = field;
124 j++;
125 }
126 max_shadow_read_write_fields = j;
127}
128
129/*
130 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
131 * set the success or error code of an emulated VMX instruction (as specified
132 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
133 * instruction.
134 */
135static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
136{
137 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
138 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
139 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
140 return kvm_skip_emulated_instruction(vcpu);
141}
142
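/* VMfailInvalid: RFLAGS.CF is set and the other arithmetic flags are cleared. */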
143static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
144{
145 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
146 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
147 X86_EFLAGS_SF | X86_EFLAGS_OF))
148 | X86_EFLAGS_CF);
149 return kvm_skip_emulated_instruction(vcpu);
150}
151
152static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
153 u32 vm_instruction_error)
154{
155 struct vcpu_vmx *vmx = to_vmx(vcpu);
156
157 /*
158 * failValid writes the error number to the current VMCS, which
159 * can't be done if there isn't a current VMCS.
160 */
161 if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
162 return nested_vmx_failInvalid(vcpu);
163
164 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
165 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
166 X86_EFLAGS_SF | X86_EFLAGS_OF))
167 | X86_EFLAGS_ZF);
168 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
169 /*
170 * We don't need to force a shadow sync because
171 * VM_INSTRUCTION_ERROR is not shadowed
172 */
173 return kvm_skip_emulated_instruction(vcpu);
174}
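/*
 * Per the SDM conventions implemented above: VMsucceed clears all six
 * status flags; VMfailInvalid sets only CF (used when there is no
 * current VMCS to record an error in); VMfailValid sets only ZF and
 * stores the error number in the current VMCS's VM-instruction error
 * field.
 */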
175
176static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
177{
178 /* TODO: do not simply reset the guest here. */
179 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
180 pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
181}
182
183static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
184{
185 vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
186 vmcs_write64(VMCS_LINK_POINTER, -1ull);
187}
188
189static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
190{
191 struct vcpu_vmx *vmx = to_vmx(vcpu);
192
193 if (!vmx->nested.hv_evmcs)
194 return;
195
196 kunmap(vmx->nested.hv_evmcs_page);
197 kvm_release_page_dirty(vmx->nested.hv_evmcs_page);
198 vmx->nested.hv_evmcs_vmptr = -1ull;
199 vmx->nested.hv_evmcs_page = NULL;
200 vmx->nested.hv_evmcs = NULL;
201}
202
203/*
204 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
205 * just stops using VMX.
206 */
207static void free_nested(struct kvm_vcpu *vcpu)
208{
209 struct vcpu_vmx *vmx = to_vmx(vcpu);
210
211 if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
212 return;
213
214 vmx->nested.vmxon = false;
215 vmx->nested.smm.vmxon = false;
216 free_vpid(vmx->nested.vpid02);
217 vmx->nested.posted_intr_nv = -1;
218 vmx->nested.current_vmptr = -1ull;
219 if (enable_shadow_vmcs) {
220 vmx_disable_shadow_vmcs(vmx);
221 vmcs_clear(vmx->vmcs01.shadow_vmcs);
222 free_vmcs(vmx->vmcs01.shadow_vmcs);
223 vmx->vmcs01.shadow_vmcs = NULL;
224 }
225 kfree(vmx->nested.cached_vmcs12);
226 kfree(vmx->nested.cached_shadow_vmcs12);
227 /* Unpin physical memory we referred to in the vmcs02 */
228 if (vmx->nested.apic_access_page) {
229 kvm_release_page_dirty(vmx->nested.apic_access_page);
230 vmx->nested.apic_access_page = NULL;
231 }
232 if (vmx->nested.virtual_apic_page) {
233 kvm_release_page_dirty(vmx->nested.virtual_apic_page);
234 vmx->nested.virtual_apic_page = NULL;
235 }
236 if (vmx->nested.pi_desc_page) {
237 kunmap(vmx->nested.pi_desc_page);
238 kvm_release_page_dirty(vmx->nested.pi_desc_page);
239 vmx->nested.pi_desc_page = NULL;
240 vmx->nested.pi_desc = NULL;
241 }
242
243 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
244
245 nested_release_evmcs(vcpu);
246
247 free_loaded_vmcs(&vmx->nested.vmcs02);
248}
249
250static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
251{
252 struct vcpu_vmx *vmx = to_vmx(vcpu);
253 int cpu;
254
255 if (vmx->loaded_vmcs == vmcs)
256 return;
257
258 cpu = get_cpu();
259 vmx_vcpu_put(vcpu);
260 vmx->loaded_vmcs = vmcs;
261 vmx_vcpu_load(vcpu, cpu);
262 put_cpu();
263
264 vm_entry_controls_reset_shadow(vmx);
265 vm_exit_controls_reset_shadow(vmx);
266 vmx_segment_cache_clear(vmx);
267}
268
269/*
270 * Ensure that the current vmcs of the logical processor is the
271 * vmcs01 of the vcpu before calling free_nested().
272 */
273void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
274{
275 vcpu_load(vcpu);
276 vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
277 free_nested(vcpu);
278 vcpu_put(vcpu);
279}
280
281static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
282 struct x86_exception *fault)
283{
284 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
285 struct vcpu_vmx *vmx = to_vmx(vcpu);
286 u32 exit_reason;
287 unsigned long exit_qualification = vcpu->arch.exit_qualification;
288
289 if (vmx->nested.pml_full) {
290 exit_reason = EXIT_REASON_PML_FULL;
291 vmx->nested.pml_full = false;
292 exit_qualification &= INTR_INFO_UNBLOCK_NMI;
293 } else if (fault->error_code & PFERR_RSVD_MASK)
294 exit_reason = EXIT_REASON_EPT_MISCONFIG;
295 else
296 exit_reason = EXIT_REASON_EPT_VIOLATION;
297
298 nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
299 vmcs12->guest_physical_address = fault->address;
300}
301
302static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
303{
304 WARN_ON(mmu_is_nested(vcpu));
305
306 vcpu->arch.mmu = &vcpu->arch.guest_mmu;
307 kvm_init_shadow_ept_mmu(vcpu,
308 to_vmx(vcpu)->nested.msrs.ept_caps &
309 VMX_EPT_EXECUTE_ONLY_BIT,
310 nested_ept_ad_enabled(vcpu),
311 nested_ept_get_cr3(vcpu));
312 vcpu->arch.mmu->set_cr3 = vmx_set_cr3;
313 vcpu->arch.mmu->get_cr3 = nested_ept_get_cr3;
314 vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
315 vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;
316
317 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
318}
319
320static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
321{
322 vcpu->arch.mmu = &vcpu->arch.root_mmu;
323 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
324}
325
326static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
327 u16 error_code)
328{
329 bool inequality, bit;
330
331 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
332 inequality =
333 (error_code & vmcs12->page_fault_error_code_mask) !=
334 vmcs12->page_fault_error_code_match;
335 return inequality ^ bit;
336}
337
338
339/*
340 * KVM wants to inject page-faults which it got to the guest. This function
341 * checks whether in a nested guest, we need to inject them to L1 or L2.
342 */
343static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
344{
345 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
346 unsigned int nr = vcpu->arch.exception.nr;
347 bool has_payload = vcpu->arch.exception.has_payload;
348 unsigned long payload = vcpu->arch.exception.payload;
349
350 if (nr == PF_VECTOR) {
351 if (vcpu->arch.exception.nested_apf) {
352 *exit_qual = vcpu->arch.apf.nested_apf_token;
353 return 1;
354 }
355 if (nested_vmx_is_page_fault_vmexit(vmcs12,
356 vcpu->arch.exception.error_code)) {
357 *exit_qual = has_payload ? payload : vcpu->arch.cr2;
358 return 1;
359 }
360 } else if (vmcs12->exception_bitmap & (1u << nr)) {
361 if (nr == DB_VECTOR) {
362 if (!has_payload) {
363 payload = vcpu->arch.dr6;
364 payload &= ~(DR6_FIXED_1 | DR6_BT);
365 payload ^= DR6_RTM;
366 }
367 *exit_qual = payload;
368 } else
369 *exit_qual = 0;
370 return 1;
371 }
372
373 return 0;
374}
375
376
377static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
378 struct x86_exception *fault)
379{
380 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
381
382 WARN_ON(!is_guest_mode(vcpu));
383
384 if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
385 !to_vmx(vcpu)->nested.nested_run_pending) {
386 vmcs12->vm_exit_intr_error_code = fault->error_code;
387 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
388 PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
389 INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
390 fault->address);
391 } else {
392 kvm_inject_page_fault(vcpu, fault);
393 }
394}
395
396static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
397{
398 return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
399}
400
401static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
402 struct vmcs12 *vmcs12)
403{
404 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
405 return 0;
406
407 if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) ||
408 !page_address_valid(vcpu, vmcs12->io_bitmap_b))
409 return -EINVAL;
410
411 return 0;
412}
413
414static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
415 struct vmcs12 *vmcs12)
416{
417 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
418 return 0;
419
420 if (!page_address_valid(vcpu, vmcs12->msr_bitmap))
421 return -EINVAL;
422
423 return 0;
424}
425
426static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
427 struct vmcs12 *vmcs12)
428{
429 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
430 return 0;
431
432 if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))
433 return -EINVAL;
434
435 return 0;
436}
437
438/*
439 * Check whether a write to the MSR is intercepted in the L01 MSR bitmap.
440 */
441static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
442{
443 unsigned long *msr_bitmap;
444 int f = sizeof(unsigned long);
445
446 if (!cpu_has_vmx_msr_bitmap())
447 return true;
448
449 msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
450
451 if (msr <= 0x1fff) {
452 return !!test_bit(msr, msr_bitmap + 0x800 / f);
453 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
454 msr &= 0x1fff;
455 return !!test_bit(msr, msr_bitmap + 0xc00 / f);
456 }
457
458 return true;
459}
460
461/*
462 * If an MSR is allowed by L0, check whether it is also allowed by L1. The
463 * corresponding bit is cleared (the MSR is passed through) only if both allow it.
464 */
465static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
466 unsigned long *msr_bitmap_nested,
467 u32 msr, int type)
468{
469 int f = sizeof(unsigned long);
470
471 /*
472 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
473 * have the write-low and read-high bitmap offsets the wrong way round.
474 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
475 */
476 if (msr <= 0x1fff) {
477 if (type & MSR_TYPE_R &&
478 !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
479 /* read-low */
480 __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
481
482 if (type & MSR_TYPE_W &&
483 !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
484 /* write-low */
485 __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
486
487 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
488 msr &= 0x1fff;
489 if (type & MSR_TYPE_R &&
490 !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
491 /* read-high */
492 __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
493
494 if (type & MSR_TYPE_W &&
495 !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
496 /* write-high */
497 __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
498
499 }
500}
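/*
 * Layout of the 4K MSR bitmap used above: read-low (MSRs
 * 0x00000000-0x00001fff) at offset 0x000, read-high (MSRs
 * 0xc0000000-0xc0001fff) at 0x400, write-low at 0x800 and write-high
 * at 0xc00, one bit per MSR, where a set bit means "intercept".  For
 * example, the write-intercept bit for MSR_IA32_SPEC_CTRL (0x48) is
 * bit 0x48 of the 1K region starting at offset 0x800.
 */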
501
502/*
503 * Merge L0's and L1's MSR bitmaps; return false to indicate that
504 * we do not use the hardware.
505 */
506static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
507 struct vmcs12 *vmcs12)
508{
509 int msr;
510 struct page *page;
511 unsigned long *msr_bitmap_l1;
512 unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
513 /*
514 * pred_cmd & spec_ctrl are trying to verify two things:
515 *
516 * 1. L0 gave permission to L1 to actually pass through the MSR. This
517 * ensures that we do not accidentally generate an L02 MSR bitmap
518 * from the L12 MSR bitmap that is too permissive.
519 * 2. That L1 or its L2s have actually used the MSR. This avoids
520 * unnecessary merging of the bitmap if the MSR is unused. This
521 * works properly because we only update the L01 MSR bitmap lazily.
522 * So even if L0 should pass these MSRs through to L1, the L01 bitmap
523 * is only updated to reflect this when L1 (or its L2s) actually
524 * writes to the MSR.
525 */
526 bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
527 bool spec_ctrl = !msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL);
528
529 /* Nothing to do if the MSR bitmap is not in use. */
530 if (!cpu_has_vmx_msr_bitmap() ||
531 !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
532 return false;
533
534 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
535 !pred_cmd && !spec_ctrl)
536 return false;
537
538 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
539 if (is_error_page(page))
540 return false;
541
542 msr_bitmap_l1 = (unsigned long *)kmap(page);
543 if (nested_cpu_has_apic_reg_virt(vmcs12)) {
544 /*
545 * L0 need not intercept reads for MSRs between 0x800 and 0x8ff, it
546 * just lets the processor take the value from the virtual-APIC page;
547 * take those 256 bits directly from the L1 bitmap.
548 */
549 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
550 unsigned word = msr / BITS_PER_LONG;
551 msr_bitmap_l0[word] = msr_bitmap_l1[word];
552 msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0;
553 }
554 } else {
555 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
556 unsigned word = msr / BITS_PER_LONG;
557 msr_bitmap_l0[word] = ~0;
558 msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0;
559 }
560 }
561
562 nested_vmx_disable_intercept_for_msr(
563 msr_bitmap_l1, msr_bitmap_l0,
564 X2APIC_MSR(APIC_TASKPRI),
565 MSR_TYPE_W);
566
567 if (nested_cpu_has_vid(vmcs12)) {
568 nested_vmx_disable_intercept_for_msr(
569 msr_bitmap_l1, msr_bitmap_l0,
570 X2APIC_MSR(APIC_EOI),
571 MSR_TYPE_W);
572 nested_vmx_disable_intercept_for_msr(
573 msr_bitmap_l1, msr_bitmap_l0,
574 X2APIC_MSR(APIC_SELF_IPI),
575 MSR_TYPE_W);
576 }
577
578 if (spec_ctrl)
579 nested_vmx_disable_intercept_for_msr(
580 msr_bitmap_l1, msr_bitmap_l0,
581 MSR_IA32_SPEC_CTRL,
582 MSR_TYPE_R | MSR_TYPE_W);
583
584 if (pred_cmd)
585 nested_vmx_disable_intercept_for_msr(
586 msr_bitmap_l1, msr_bitmap_l0,
587 MSR_IA32_PRED_CMD,
588 MSR_TYPE_W);
589
590 kunmap(page);
591 kvm_release_page_clean(page);
592
593 return true;
594}
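/*
 * Note on the x2APIC range handled above: MSRs 0x800-0x8ff map 1:1 to
 * APIC registers, so X2APIC_MSR(APIC_TASKPRI) is MSR 0x808 (the TPR).
 * When L1 enables APIC-register virtualization, the merge copies L1's
 * read intercepts for that range verbatim; otherwise reads stay
 * intercepted.  Writes remain intercepted except for the TPR (and,
 * with virtual-interrupt delivery, EOI and SELF_IPI) writes that are
 * explicitly opened up above.
 */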
595
596static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
597 struct vmcs12 *vmcs12)
598{
599 struct vmcs12 *shadow;
600 struct page *page;
601
602 if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
603 vmcs12->vmcs_link_pointer == -1ull)
604 return;
605
606 shadow = get_shadow_vmcs12(vcpu);
607 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
608
609 memcpy(shadow, kmap(page), VMCS12_SIZE);
610
611 kunmap(page);
612 kvm_release_page_clean(page);
613}
614
615static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
616 struct vmcs12 *vmcs12)
617{
618 struct vcpu_vmx *vmx = to_vmx(vcpu);
619
620 if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
621 vmcs12->vmcs_link_pointer == -1ull)
622 return;
623
624 kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
625 get_shadow_vmcs12(vcpu), VMCS12_SIZE);
626}
627
628/*
629 * In nested virtualization, check if L1 has set
630 * VM_EXIT_ACK_INTR_ON_EXIT
631 */
632static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
633{
634 return get_vmcs12(vcpu)->vm_exit_controls &
635 VM_EXIT_ACK_INTR_ON_EXIT;
636}
637
638static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
639{
640 return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
641}
642
643static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
644 struct vmcs12 *vmcs12)
645{
646 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
647 !page_address_valid(vcpu, vmcs12->apic_access_addr))
648 return -EINVAL;
649 else
650 return 0;
651}
652
653static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
654 struct vmcs12 *vmcs12)
655{
656 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
657 !nested_cpu_has_apic_reg_virt(vmcs12) &&
658 !nested_cpu_has_vid(vmcs12) &&
659 !nested_cpu_has_posted_intr(vmcs12))
660 return 0;
661
662 /*
663 * If "virtualize x2APIC mode" is enabled,
664 * "virtualize APIC accesses" must be disabled.
665 */
666 if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
667 nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
668 return -EINVAL;
669
670 /*
671 * If virtual interrupt delivery is enabled,
672 * we must exit on external interrupts.
673 */
674 if (nested_cpu_has_vid(vmcs12) &&
675 !nested_exit_on_intr(vcpu))
676 return -EINVAL;
677
678 /*
679 * bits 15:8 should be zero in posted_intr_nv,
680 * the descriptor address has already been checked
681 * in nested_get_vmcs12_pages.
682 *
683 * bits 5:0 of posted_intr_desc_addr should be zero.
684 */
685 if (nested_cpu_has_posted_intr(vmcs12) &&
686 (!nested_cpu_has_vid(vmcs12) ||
687 !nested_exit_intr_ack_set(vcpu) ||
688 (vmcs12->posted_intr_nv & 0xff00) ||
689 (vmcs12->posted_intr_desc_addr & 0x3f) ||
690 (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu))))
691 return -EINVAL;
692
693 /* TPR shadow is needed by all APICv features. */
694 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
695 return -EINVAL;
696
697 return 0;
698}
699
700static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
701 u32 count, u64 addr)
702{
703 int maxphyaddr;
704
705 if (count == 0)
706 return 0;
707 maxphyaddr = cpuid_maxphyaddr(vcpu);
708 if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
709 (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr)
710 return -EINVAL;
711
712 return 0;
713}
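/*
 * Each entry in an MSR load/store area (struct vmx_msr_entry) is 16
 * bytes, so the check above enforces both the architectural 16-byte
 * alignment and that the last byte of the area, at
 * addr + count * 16 - 1, still lies below the guest's physical
 * address width.
 */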
714
715static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
716 struct vmcs12 *vmcs12)
717{
718 if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_load_count,
719 vmcs12->vm_exit_msr_load_addr) ||
720 nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_store_count,
721 vmcs12->vm_exit_msr_store_addr))
722 return -EINVAL;
723
724 return 0;
725}
726
727static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
728 struct vmcs12 *vmcs12)
729{
730 if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_entry_msr_load_count,
731 vmcs12->vm_entry_msr_load_addr))
732 return -EINVAL;
733
734 return 0;
735}
736
737static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
738 struct vmcs12 *vmcs12)
739{
740 if (!nested_cpu_has_pml(vmcs12))
741 return 0;
742
743 if (!nested_cpu_has_ept(vmcs12) ||
744 !page_address_valid(vcpu, vmcs12->pml_address))
745 return -EINVAL;
746
747 return 0;
748}
749
750static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
751 struct vmcs12 *vmcs12)
752{
753 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
754 !nested_cpu_has_ept(vmcs12))
755 return -EINVAL;
756 return 0;
757}
758
759static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
760 struct vmcs12 *vmcs12)
761{
762 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
763 !nested_cpu_has_ept(vmcs12))
764 return -EINVAL;
765 return 0;
766}
767
768static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
769 struct vmcs12 *vmcs12)
770{
771 if (!nested_cpu_has_shadow_vmcs(vmcs12))
772 return 0;
773
774 if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) ||
775 !page_address_valid(vcpu, vmcs12->vmwrite_bitmap))
776 return -EINVAL;
777
778 return 0;
779}
780
781static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
782 struct vmx_msr_entry *e)
783{
784 /* x2APIC MSR accesses are not allowed */
785 if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)
786 return -EINVAL;
787 if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
788 e->index == MSR_IA32_UCODE_REV)
789 return -EINVAL;
790 if (e->reserved != 0)
791 return -EINVAL;
792 return 0;
793}
794
795static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
796 struct vmx_msr_entry *e)
797{
798 if (e->index == MSR_FS_BASE ||
799 e->index == MSR_GS_BASE ||
800 e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
801 nested_vmx_msr_check_common(vcpu, e))
802 return -EINVAL;
803 return 0;
804}
805
806static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
807 struct vmx_msr_entry *e)
808{
809 if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
810 nested_vmx_msr_check_common(vcpu, e))
811 return -EINVAL;
812 return 0;
813}
814
815/*
816 * Load the guest's/host's MSRs at nested entry/exit.
817 * Return 0 on success, or the (1-based) index of the failing entry on failure.
818 */
819static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
820{
821 u32 i;
822 struct vmx_msr_entry e;
823 struct msr_data msr;
824
825 msr.host_initiated = false;
826 for (i = 0; i < count; i++) {
827 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
828 &e, sizeof(e))) {
829 pr_debug_ratelimited(
830 "%s cannot read MSR entry (%u, 0x%08llx)\n",
831 __func__, i, gpa + i * sizeof(e));
832 goto fail;
833 }
834 if (nested_vmx_load_msr_check(vcpu, &e)) {
835 pr_debug_ratelimited(
836 "%s check failed (%u, 0x%x, 0x%x)\n",
837 __func__, i, e.index, e.reserved);
838 goto fail;
839 }
840 msr.index = e.index;
841 msr.data = e.value;
842 if (kvm_set_msr(vcpu, &msr)) {
843 pr_debug_ratelimited(
844 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
845 __func__, i, e.index, e.value);
846 goto fail;
847 }
848 }
849 return 0;
850fail:
851 return i + 1;
852}
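/*
 * The non-zero value returned above is i + 1, i.e. the 1-based number
 * of the entry that failed to load, which is the format the SDM
 * specifies for the exit qualification of a "VM-entry failure due to
 * MSR loading" exit.
 */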
853
854static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
855{
856 u32 i;
857 struct vmx_msr_entry e;
858
859 for (i = 0; i < count; i++) {
860 struct msr_data msr_info;
861 if (kvm_vcpu_read_guest(vcpu,
862 gpa + i * sizeof(e),
863 &e, 2 * sizeof(u32))) {
864 pr_debug_ratelimited(
865 "%s cannot read MSR entry (%u, 0x%08llx)\n",
866 __func__, i, gpa + i * sizeof(e));
867 return -EINVAL;
868 }
869 if (nested_vmx_store_msr_check(vcpu, &e)) {
870 pr_debug_ratelimited(
871 "%s check failed (%u, 0x%x, 0x%x)\n",
872 __func__, i, e.index, e.reserved);
873 return -EINVAL;
874 }
875 msr_info.host_initiated = false;
876 msr_info.index = e.index;
877 if (kvm_get_msr(vcpu, &msr_info)) {
878 pr_debug_ratelimited(
879 "%s cannot read MSR (%u, 0x%x)\n",
880 __func__, i, e.index);
881 return -EINVAL;
882 }
883 if (kvm_vcpu_write_guest(vcpu,
884 gpa + i * sizeof(e) +
885 offsetof(struct vmx_msr_entry, value),
886 &msr_info.data, sizeof(msr_info.data))) {
887 pr_debug_ratelimited(
888 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
889 __func__, i, e.index, msr_info.data);
890 return -EINVAL;
891 }
892 }
893 return 0;
894}
895
896static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
897{
898 unsigned long invalid_mask;
899
900 invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
901 return (val & invalid_mask) == 0;
902}
903
904/*
905 * Load the guest's/host's CR3 at nested entry/exit. nested_ept is true if we
906 * are emulating VM entry into a guest with EPT enabled.
907 * Returns 0 on success, 1 on failure. On failure, the appropriate invalid-state
908 * exit qualification code is stored in *entry_failure_code.
909 */
910static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
911 u32 *entry_failure_code)
912{
913 if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
914 if (!nested_cr3_valid(vcpu, cr3)) {
915 *entry_failure_code = ENTRY_FAIL_DEFAULT;
916 return 1;
917 }
918
919 /*
920 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
921 * must not be dereferenced.
922 */
923 if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) &&
924 !nested_ept) {
925 if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) {
926 *entry_failure_code = ENTRY_FAIL_PDPTE;
927 return 1;
928 }
929 }
930 }
931
932 if (!nested_ept)
933 kvm_mmu_new_cr3(vcpu, cr3, false);
934
935 vcpu->arch.cr3 = cr3;
936 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
937
938 kvm_init_mmu(vcpu, false);
939
940 return 0;
941}
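/*
 * Note that when nested_ept is true the new CR3 is purely guest state:
 * L2's physical mappings come from the shadow EPT roots rather than
 * CR3, so kvm_mmu_new_cr3() is skipped and the PDPTEs are not read
 * from memory.
 */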
942
943/*
944 * Returns true if KVM is able to configure the CPU to tag TLB entries
945 * populated by L2 differently from TLB entries populated
946 * by L1.
947 *
948 * If L1 uses EPT, then TLB entries are tagged with a different EPTP.
949 *
950 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
951 * with different VPID (L1 entries are tagged with vmx->vpid
952 * while L2 entries are tagged with vmx->nested.vpid02).
953 */
954static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
955{
956 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
957
958 return nested_cpu_has_ept(vmcs12) ||
959 (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
960}
961
962static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
963{
964 struct vcpu_vmx *vmx = to_vmx(vcpu);
965
966 return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
967}
968
969
970static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
971{
972 return fixed_bits_valid(control, low, high);
973}
974
975static inline u64 vmx_control_msr(u32 low, u32 high)
976{
977 return low | ((u64)high << 32);
978}
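/*
 * A VMX control capability MSR packs the allowed 0-settings in bits
 * 31:0 (a bit reported as 1 there must be 1 in the control) and the
 * allowed 1-settings in bits 63:32 (the control bit may only be 1 if
 * the corresponding bit is 1 there).  vmx_control_msr() builds that
 * layout; vmx_restore_control_msr() below checks it in both
 * directions.
 */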
979
980static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
981{
982 superset &= mask;
983 subset &= mask;
984
985 return (superset | subset) == superset;
986}
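/*
 * For example, is_bitwise_subset(0xf0, 0x30, -1ULL) is true because
 * the subset sets no bit that the superset lacks, while
 * is_bitwise_subset(0xf0, 0x0f, -1ULL) is false.
 */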
987
988static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
989{
990 const u64 feature_and_reserved =
991 /* feature (except bit 48; see below) */
992 BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
993 /* reserved */
994 BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
995 u64 vmx_basic = vmx->nested.msrs.basic;
996
997 if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
998 return -EINVAL;
999
1000 /*
1001 * KVM does not emulate a version of VMX that constrains physical
1002 * addresses of VMX structures (e.g. VMCS) to 32-bits.
1003 */
1004 if (data & BIT_ULL(48))
1005 return -EINVAL;
1006
1007 if (vmx_basic_vmcs_revision_id(vmx_basic) !=
1008 vmx_basic_vmcs_revision_id(data))
1009 return -EINVAL;
1010
1011 if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
1012 return -EINVAL;
1013
1014 vmx->nested.msrs.basic = data;
1015 return 0;
1016}
1017
1018static int
1019vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1020{
1021 u64 supported;
1022 u32 *lowp, *highp;
1023
1024 switch (msr_index) {
1025 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1026 lowp = &vmx->nested.msrs.pinbased_ctls_low;
1027 highp = &vmx->nested.msrs.pinbased_ctls_high;
1028 break;
1029 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1030 lowp = &vmx->nested.msrs.procbased_ctls_low;
1031 highp = &vmx->nested.msrs.procbased_ctls_high;
1032 break;
1033 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1034 lowp = &vmx->nested.msrs.exit_ctls_low;
1035 highp = &vmx->nested.msrs.exit_ctls_high;
1036 break;
1037 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1038 lowp = &vmx->nested.msrs.entry_ctls_low;
1039 highp = &vmx->nested.msrs.entry_ctls_high;
1040 break;
1041 case MSR_IA32_VMX_PROCBASED_CTLS2:
1042 lowp = &vmx->nested.msrs.secondary_ctls_low;
1043 highp = &vmx->nested.msrs.secondary_ctls_high;
1044 break;
1045 default:
1046 BUG();
1047 }
1048
1049 supported = vmx_control_msr(*lowp, *highp);
1050
1051 /* Check must-be-1 bits are still 1. */
1052 if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
1053 return -EINVAL;
1054
1055 /* Check must-be-0 bits are still 0. */
1056 if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
1057 return -EINVAL;
1058
1059 *lowp = data;
1060 *highp = data >> 32;
1061 return 0;
1062}
1063
1064static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
1065{
1066 const u64 feature_and_reserved_bits =
1067 /* feature */
1068 BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
1069 BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
1070 /* reserved */
1071 GENMASK_ULL(13, 9) | BIT_ULL(31);
1072 u64 vmx_misc;
1073
1074 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
1075 vmx->nested.msrs.misc_high);
1076
1077 if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
1078 return -EINVAL;
1079
1080 if ((vmx->nested.msrs.pinbased_ctls_high &
1081 PIN_BASED_VMX_PREEMPTION_TIMER) &&
1082 vmx_misc_preemption_timer_rate(data) !=
1083 vmx_misc_preemption_timer_rate(vmx_misc))
1084 return -EINVAL;
1085
1086 if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
1087 return -EINVAL;
1088
1089 if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
1090 return -EINVAL;
1091
1092 if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
1093 return -EINVAL;
1094
1095 vmx->nested.msrs.misc_low = data;
1096 vmx->nested.msrs.misc_high = data >> 32;
1097
1098 /*
1099 * If L1 has read-only VM-exit information fields, use the
1100 * less permissive vmx_vmwrite_bitmap to specify write
1101 * permissions for the shadow VMCS.
1102 */
1103 if (enable_shadow_vmcs && !nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
1104 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
1105
1106 return 0;
1107}
1108
1109static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
1110{
1111 u64 vmx_ept_vpid_cap;
1112
1113 vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
1114 vmx->nested.msrs.vpid_caps);
1115
1116 /* Every bit is either reserved or a feature bit. */
1117 if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
1118 return -EINVAL;
1119
1120 vmx->nested.msrs.ept_caps = data;
1121 vmx->nested.msrs.vpid_caps = data >> 32;
1122 return 0;
1123}
1124
1125static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1126{
1127 u64 *msr;
1128
1129 switch (msr_index) {
1130 case MSR_IA32_VMX_CR0_FIXED0:
1131 msr = &vmx->nested.msrs.cr0_fixed0;
1132 break;
1133 case MSR_IA32_VMX_CR4_FIXED0:
1134 msr = &vmx->nested.msrs.cr4_fixed0;
1135 break;
1136 default:
1137 BUG();
1138 }
1139
1140 /*
1141 * Bits that are currently 1 (i.e. bits that "must be 1" during VMX
1142 * operation) must also be 1 in the restored value.
1143 */
1144 if (!is_bitwise_subset(data, *msr, -1ULL))
1145 return -EINVAL;
1146
1147 *msr = data;
1148 return 0;
1149}
1150
1151/*
1152 * Called when userspace is restoring VMX MSRs.
1153 *
1154 * Returns 0 on success, non-0 otherwise.
1155 */
1156int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1157{
1158 struct vcpu_vmx *vmx = to_vmx(vcpu);
1159
1160 /*
1161 * Don't allow changes to the VMX capability MSRs while the vCPU
1162 * is in VMX operation.
1163 */
1164 if (vmx->nested.vmxon)
1165 return -EBUSY;
1166
1167 switch (msr_index) {
1168 case MSR_IA32_VMX_BASIC:
1169 return vmx_restore_vmx_basic(vmx, data);
1170 case MSR_IA32_VMX_PINBASED_CTLS:
1171 case MSR_IA32_VMX_PROCBASED_CTLS:
1172 case MSR_IA32_VMX_EXIT_CTLS:
1173 case MSR_IA32_VMX_ENTRY_CTLS:
1174 /*
1175 * The "non-true" VMX capability MSRs are generated from the
1176 * "true" MSRs, so we do not support restoring them directly.
1177 *
1178 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
1179 * should restore the "true" MSRs with the must-be-1 bits
1180 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
1181 * DEFAULT SETTINGS".
1182 */
1183 return -EINVAL;
1184 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1185 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1186 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1187 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1188 case MSR_IA32_VMX_PROCBASED_CTLS2:
1189 return vmx_restore_control_msr(vmx, msr_index, data);
1190 case MSR_IA32_VMX_MISC:
1191 return vmx_restore_vmx_misc(vmx, data);
1192 case MSR_IA32_VMX_CR0_FIXED0:
1193 case MSR_IA32_VMX_CR4_FIXED0:
1194 return vmx_restore_fixed0_msr(vmx, msr_index, data);
1195 case MSR_IA32_VMX_CR0_FIXED1:
1196 case MSR_IA32_VMX_CR4_FIXED1:
1197 /*
1198 * These MSRs are generated based on the vCPU's CPUID, so we
1199 * do not support restoring them directly.
1200 */
1201 return -EINVAL;
1202 case MSR_IA32_VMX_EPT_VPID_CAP:
1203 return vmx_restore_vmx_ept_vpid_cap(vmx, data);
1204 case MSR_IA32_VMX_VMCS_ENUM:
1205 vmx->nested.msrs.vmcs_enum = data;
1206 return 0;
1207 default:
1208 /*
1209 * The rest of the VMX capability MSRs do not support restore.
1210 */
1211 return -EINVAL;
1212 }
1213}
1214
1215/* Returns 0 on success, non-0 otherwise. */
1216int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
1217{
1218 switch (msr_index) {
1219 case MSR_IA32_VMX_BASIC:
1220 *pdata = msrs->basic;
1221 break;
1222 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1223 case MSR_IA32_VMX_PINBASED_CTLS:
1224 *pdata = vmx_control_msr(
1225 msrs->pinbased_ctls_low,
1226 msrs->pinbased_ctls_high);
1227 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
1228 *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1229 break;
1230 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1231 case MSR_IA32_VMX_PROCBASED_CTLS:
1232 *pdata = vmx_control_msr(
1233 msrs->procbased_ctls_low,
1234 msrs->procbased_ctls_high);
1235 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
1236 *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1237 break;
1238 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1239 case MSR_IA32_VMX_EXIT_CTLS:
1240 *pdata = vmx_control_msr(
1241 msrs->exit_ctls_low,
1242 msrs->exit_ctls_high);
1243 if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
1244 *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
1245 break;
1246 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1247 case MSR_IA32_VMX_ENTRY_CTLS:
1248 *pdata = vmx_control_msr(
1249 msrs->entry_ctls_low,
1250 msrs->entry_ctls_high);
1251 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
1252 *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
1253 break;
1254 case MSR_IA32_VMX_MISC:
1255 *pdata = vmx_control_msr(
1256 msrs->misc_low,
1257 msrs->misc_high);
1258 break;
1259 case MSR_IA32_VMX_CR0_FIXED0:
1260 *pdata = msrs->cr0_fixed0;
1261 break;
1262 case MSR_IA32_VMX_CR0_FIXED1:
1263 *pdata = msrs->cr0_fixed1;
1264 break;
1265 case MSR_IA32_VMX_CR4_FIXED0:
1266 *pdata = msrs->cr4_fixed0;
1267 break;
1268 case MSR_IA32_VMX_CR4_FIXED1:
1269 *pdata = msrs->cr4_fixed1;
1270 break;
1271 case MSR_IA32_VMX_VMCS_ENUM:
1272 *pdata = msrs->vmcs_enum;
1273 break;
1274 case MSR_IA32_VMX_PROCBASED_CTLS2:
1275 *pdata = vmx_control_msr(
1276 msrs->secondary_ctls_low,
1277 msrs->secondary_ctls_high);
1278 break;
1279 case MSR_IA32_VMX_EPT_VPID_CAP:
1280 *pdata = msrs->ept_caps |
1281 ((u64)msrs->vpid_caps << 32);
1282 break;
1283 case MSR_IA32_VMX_VMFUNC:
1284 *pdata = msrs->vmfunc_controls;
1285 break;
1286 default:
1287 return 1;
1288 }
1289
1290 return 0;
1291}
1292
1293/*
1294 * Copy the writable VMCS shadow fields back to the VMCS12, in case
1295 * they have been modified by the L1 guest. Note that the "read-only"
1296 * VM-exit information fields are actually writable if the vCPU is
1297 * configured to support "VMWRITE to any supported field in the VMCS."
1298 */
1299static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
1300{
1301 const u16 *fields[] = {
1302 shadow_read_write_fields,
1303 shadow_read_only_fields
1304 };
1305 const int max_fields[] = {
1306 max_shadow_read_write_fields,
1307 max_shadow_read_only_fields
1308 };
1309 int i, q;
1310 unsigned long field;
1311 u64 field_value;
1312 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1313
1314 preempt_disable();
1315
1316 vmcs_load(shadow_vmcs);
1317
1318 for (q = 0; q < ARRAY_SIZE(fields); q++) {
1319 for (i = 0; i < max_fields[q]; i++) {
1320 field = fields[q][i];
1321 field_value = __vmcs_readl(field);
1322 vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value);
1323 }
1324 /*
1325 * Skip the VM-exit information fields if they are read-only.
1326 */
1327 if (!nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
1328 break;
1329 }
1330
1331 vmcs_clear(shadow_vmcs);
1332 vmcs_load(vmx->loaded_vmcs->vmcs);
1333
1334 preempt_enable();
1335}
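/*
 * vmcs_load() above makes the shadow VMCS current on this CPU, which
 * is why preemption is disabled across the copy: the vCPU's working
 * VMCS is only restored by the final vmcs_load() of
 * vmx->loaded_vmcs->vmcs.
 */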
1336
1337static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
1338{
1339 const u16 *fields[] = {
1340 shadow_read_write_fields,
1341 shadow_read_only_fields
1342 };
1343 const int max_fields[] = {
1344 max_shadow_read_write_fields,
1345 max_shadow_read_only_fields
1346 };
1347 int i, q;
1348 unsigned long field;
1349 u64 field_value = 0;
1350 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1351
1352 vmcs_load(shadow_vmcs);
1353
1354 for (q = 0; q < ARRAY_SIZE(fields); q++) {
1355 for (i = 0; i < max_fields[q]; i++) {
1356 field = fields[q][i];
1357 vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value);
1358 __vmcs_writel(field, field_value);
1359 }
1360 }
1361
1362 vmcs_clear(shadow_vmcs);
1363 vmcs_load(vmx->loaded_vmcs->vmcs);
1364}
1365
1366static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
1367{
1368 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1369 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1370
1371 /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
1372 vmcs12->tpr_threshold = evmcs->tpr_threshold;
1373 vmcs12->guest_rip = evmcs->guest_rip;
1374
1375 if (unlikely(!(evmcs->hv_clean_fields &
1376 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
1377 vmcs12->guest_rsp = evmcs->guest_rsp;
1378 vmcs12->guest_rflags = evmcs->guest_rflags;
1379 vmcs12->guest_interruptibility_info =
1380 evmcs->guest_interruptibility_info;
1381 }
1382
1383 if (unlikely(!(evmcs->hv_clean_fields &
1384 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
1385 vmcs12->cpu_based_vm_exec_control =
1386 evmcs->cpu_based_vm_exec_control;
1387 }
1388
1389 if (unlikely(!(evmcs->hv_clean_fields &
1390 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
1391 vmcs12->exception_bitmap = evmcs->exception_bitmap;
1392 }
1393
1394 if (unlikely(!(evmcs->hv_clean_fields &
1395 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
1396 vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
1397 }
1398
1399 if (unlikely(!(evmcs->hv_clean_fields &
1400 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
1401 vmcs12->vm_entry_intr_info_field =
1402 evmcs->vm_entry_intr_info_field;
1403 vmcs12->vm_entry_exception_error_code =
1404 evmcs->vm_entry_exception_error_code;
1405 vmcs12->vm_entry_instruction_len =
1406 evmcs->vm_entry_instruction_len;
1407 }
1408
1409 if (unlikely(!(evmcs->hv_clean_fields &
1410 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
1411 vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
1412 vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
1413 vmcs12->host_cr0 = evmcs->host_cr0;
1414 vmcs12->host_cr3 = evmcs->host_cr3;
1415 vmcs12->host_cr4 = evmcs->host_cr4;
1416 vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
1417 vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
1418 vmcs12->host_rip = evmcs->host_rip;
1419 vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
1420 vmcs12->host_es_selector = evmcs->host_es_selector;
1421 vmcs12->host_cs_selector = evmcs->host_cs_selector;
1422 vmcs12->host_ss_selector = evmcs->host_ss_selector;
1423 vmcs12->host_ds_selector = evmcs->host_ds_selector;
1424 vmcs12->host_fs_selector = evmcs->host_fs_selector;
1425 vmcs12->host_gs_selector = evmcs->host_gs_selector;
1426 vmcs12->host_tr_selector = evmcs->host_tr_selector;
1427 }
1428
1429 if (unlikely(!(evmcs->hv_clean_fields &
1430 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
1431 vmcs12->pin_based_vm_exec_control =
1432 evmcs->pin_based_vm_exec_control;
1433 vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
1434 vmcs12->secondary_vm_exec_control =
1435 evmcs->secondary_vm_exec_control;
1436 }
1437
1438 if (unlikely(!(evmcs->hv_clean_fields &
1439 HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
1440 vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
1441 vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
1442 }
1443
1444 if (unlikely(!(evmcs->hv_clean_fields &
1445 HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
1446 vmcs12->msr_bitmap = evmcs->msr_bitmap;
1447 }
1448
1449 if (unlikely(!(evmcs->hv_clean_fields &
1450 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
1451 vmcs12->guest_es_base = evmcs->guest_es_base;
1452 vmcs12->guest_cs_base = evmcs->guest_cs_base;
1453 vmcs12->guest_ss_base = evmcs->guest_ss_base;
1454 vmcs12->guest_ds_base = evmcs->guest_ds_base;
1455 vmcs12->guest_fs_base = evmcs->guest_fs_base;
1456 vmcs12->guest_gs_base = evmcs->guest_gs_base;
1457 vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
1458 vmcs12->guest_tr_base = evmcs->guest_tr_base;
1459 vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
1460 vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
1461 vmcs12->guest_es_limit = evmcs->guest_es_limit;
1462 vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
1463 vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
1464 vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
1465 vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
1466 vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
1467 vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
1468 vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
1469 vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
1470 vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
1471 vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
1472 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
1473 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
1474 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
1475 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
1476 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
1477 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
1478 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
1479 vmcs12->guest_es_selector = evmcs->guest_es_selector;
1480 vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
1481 vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
1482 vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
1483 vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
1484 vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
1485 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
1486 vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
1487 }
1488
1489 if (unlikely(!(evmcs->hv_clean_fields &
1490 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
1491 vmcs12->tsc_offset = evmcs->tsc_offset;
1492 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
1493 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
1494 }
1495
1496 if (unlikely(!(evmcs->hv_clean_fields &
1497 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
1498 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
1499 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
1500 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
1501 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
1502 vmcs12->guest_cr0 = evmcs->guest_cr0;
1503 vmcs12->guest_cr3 = evmcs->guest_cr3;
1504 vmcs12->guest_cr4 = evmcs->guest_cr4;
1505 vmcs12->guest_dr7 = evmcs->guest_dr7;
1506 }
1507
1508 if (unlikely(!(evmcs->hv_clean_fields &
1509 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
1510 vmcs12->host_fs_base = evmcs->host_fs_base;
1511 vmcs12->host_gs_base = evmcs->host_gs_base;
1512 vmcs12->host_tr_base = evmcs->host_tr_base;
1513 vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
1514 vmcs12->host_idtr_base = evmcs->host_idtr_base;
1515 vmcs12->host_rsp = evmcs->host_rsp;
1516 }
1517
1518 if (unlikely(!(evmcs->hv_clean_fields &
1519 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
1520 vmcs12->ept_pointer = evmcs->ept_pointer;
1521 vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
1522 }
1523
1524 if (unlikely(!(evmcs->hv_clean_fields &
1525 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
1526 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
1527 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
1528 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
1529 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
1530 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
1531 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
1532 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
1533 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
1534 vmcs12->guest_pending_dbg_exceptions =
1535 evmcs->guest_pending_dbg_exceptions;
1536 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
1537 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
1538 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
1539 vmcs12->guest_activity_state = evmcs->guest_activity_state;
1540 vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
1541 }
1542
1543 /*
1544 * Not used?
1545 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
1546 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
1547 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
1548 * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0;
1549 * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1;
1550 * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2;
1551 * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3;
1552 * vmcs12->page_fault_error_code_mask =
1553 * evmcs->page_fault_error_code_mask;
1554 * vmcs12->page_fault_error_code_match =
1555 * evmcs->page_fault_error_code_match;
1556 * vmcs12->cr3_target_count = evmcs->cr3_target_count;
1557 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
1558 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
1559 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
1560 */
1561
1562 /*
1563 * Read only fields:
1564 * vmcs12->guest_physical_address = evmcs->guest_physical_address;
1565 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
1566 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
1567 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
1568 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
1569 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
1570 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
1571 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
1572 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
1573 * vmcs12->exit_qualification = evmcs->exit_qualification;
1574 * vmcs12->guest_linear_address = evmcs->guest_linear_address;
1575 *
1576 * Not present in struct vmcs12:
1577 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
1578 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
1579 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
1580 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
1581 */
1582
1583 return 0;
1584}
1585
1586static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
1587{
1588 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1589 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1590
1591 /*
1592 * Should not be changed by KVM:
1593 *
1594 * evmcs->host_es_selector = vmcs12->host_es_selector;
1595 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
1596 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
1597 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
1598 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
1599 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
1600 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
1601 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
1602 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
1603 * evmcs->host_cr0 = vmcs12->host_cr0;
1604 * evmcs->host_cr3 = vmcs12->host_cr3;
1605 * evmcs->host_cr4 = vmcs12->host_cr4;
1606 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
1607 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
1608 * evmcs->host_rip = vmcs12->host_rip;
1609 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
1610 * evmcs->host_fs_base = vmcs12->host_fs_base;
1611 * evmcs->host_gs_base = vmcs12->host_gs_base;
1612 * evmcs->host_tr_base = vmcs12->host_tr_base;
1613 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
1614 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
1615 * evmcs->host_rsp = vmcs12->host_rsp;
1616 * sync_vmcs12() doesn't read these:
1617 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
1618 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
1619 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
1620 * evmcs->ept_pointer = vmcs12->ept_pointer;
1621 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
1622 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
1623 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
1624 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
1625 * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0;
1626 * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1;
1627 * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2;
1628 * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3;
1629 * evmcs->tpr_threshold = vmcs12->tpr_threshold;
1630 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
1631 * evmcs->exception_bitmap = vmcs12->exception_bitmap;
1632 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
1633 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
1634 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
1635 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
1636 * evmcs->page_fault_error_code_mask =
1637 * vmcs12->page_fault_error_code_mask;
1638 * evmcs->page_fault_error_code_match =
1639 * vmcs12->page_fault_error_code_match;
1640 * evmcs->cr3_target_count = vmcs12->cr3_target_count;
1641 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
1642 * evmcs->tsc_offset = vmcs12->tsc_offset;
1643 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
1644 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
1645 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
1646 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
1647 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
1648 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
1649 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
1650 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
1651 *
1652 * Not present in struct vmcs12:
1653 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
1654 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
1655 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
1656 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
1657 */
1658
1659 evmcs->guest_es_selector = vmcs12->guest_es_selector;
1660 evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
1661 evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
1662 evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
1663 evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
1664 evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
1665 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
1666 evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
1667
1668 evmcs->guest_es_limit = vmcs12->guest_es_limit;
1669 evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
1670 evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
1671 evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
1672 evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
1673 evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
1674 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
1675 evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
1676 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
1677 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
1678
1679 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
1680 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
1681 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
1682 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
1683 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
1684 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
1685 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
1686 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
1687
1688 evmcs->guest_es_base = vmcs12->guest_es_base;
1689 evmcs->guest_cs_base = vmcs12->guest_cs_base;
1690 evmcs->guest_ss_base = vmcs12->guest_ss_base;
1691 evmcs->guest_ds_base = vmcs12->guest_ds_base;
1692 evmcs->guest_fs_base = vmcs12->guest_fs_base;
1693 evmcs->guest_gs_base = vmcs12->guest_gs_base;
1694 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
1695 evmcs->guest_tr_base = vmcs12->guest_tr_base;
1696 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
1697 evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
1698
1699 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
1700 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
1701
1702 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
1703 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
1704 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
1705 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
1706
1707 evmcs->guest_pending_dbg_exceptions =
1708 vmcs12->guest_pending_dbg_exceptions;
1709 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
1710 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
1711
1712 evmcs->guest_activity_state = vmcs12->guest_activity_state;
1713 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
1714
1715 evmcs->guest_cr0 = vmcs12->guest_cr0;
1716 evmcs->guest_cr3 = vmcs12->guest_cr3;
1717 evmcs->guest_cr4 = vmcs12->guest_cr4;
1718 evmcs->guest_dr7 = vmcs12->guest_dr7;
1719
1720 evmcs->guest_physical_address = vmcs12->guest_physical_address;
1721
1722 evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
1723 evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
1724 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
1725 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
1726 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
1727 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
1728 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
1729 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
1730
1731 evmcs->exit_qualification = vmcs12->exit_qualification;
1732
1733 evmcs->guest_linear_address = vmcs12->guest_linear_address;
1734 evmcs->guest_rsp = vmcs12->guest_rsp;
1735 evmcs->guest_rflags = vmcs12->guest_rflags;
1736
1737 evmcs->guest_interruptibility_info =
1738 vmcs12->guest_interruptibility_info;
1739 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
1740 evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
1741 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
1742 evmcs->vm_entry_exception_error_code =
1743 vmcs12->vm_entry_exception_error_code;
1744 evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
1745
1746 evmcs->guest_rip = vmcs12->guest_rip;
1747
1748 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
1749
1750 return 0;
1751}
1752
1753/*
1754 * This is an equivalent of the nested hypervisor executing the vmptrld
1755 * instruction.
1756 */
1757static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
1758 bool from_launch)
1759{
1760 struct vcpu_vmx *vmx = to_vmx(vcpu);
1761 struct hv_vp_assist_page assist_page;
1762
1763 if (likely(!vmx->nested.enlightened_vmcs_enabled))
1764 return 1;
1765
1766 if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page)))
1767 return 1;
1768
1769 if (unlikely(!assist_page.enlighten_vmentry))
1770 return 1;
1771
1772 if (unlikely(assist_page.current_nested_vmcs !=
1773 vmx->nested.hv_evmcs_vmptr)) {
1774
1775 if (!vmx->nested.hv_evmcs)
1776 vmx->nested.current_vmptr = -1ull;
1777
1778 nested_release_evmcs(vcpu);
1779
1780 vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page(
1781 vcpu, assist_page.current_nested_vmcs);
1782
1783 if (unlikely(is_error_page(vmx->nested.hv_evmcs_page)))
1784 return 0;
1785
1786 vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page);
1787
1788 /*
1789 * Currently, KVM only supports eVMCS version 1
1790 * (== KVM_EVMCS_VERSION) and thus we expect the guest to set the
1791 * first u32 field of the eVMCS, which specifies the eVMCS
1792 * VersionNumber, to this value.
1793 *
1794 * The guest should learn the eVMCS versions supported by the host
1795 * by examining CPUID.0x4000000A.EAX[0:15]. The host userspace VMM
1796 * is expected to set this CPUID leaf according to the value
1797 * returned in vmcs_version from nested_enable_evmcs().
1798 *
1799 * However, it turns out that Microsoft Hyper-V fails to comply
1800 * with its own invented interface: when Hyper-V uses eVMCS, it
1801 * sets the first u32 field of the eVMCS to the revision_id
1802 * specified in MSR_IA32_VMX_BASIC instead of the eVMCS version
1803 * number, which should be one of the supported versions specified
1804 * in CPUID.0x4000000A.EAX[0:15].
1805 *
1806 * To work around this Hyper-V bug, we accept here either a
1807 * supported eVMCS version or the VMCS12 revision_id as valid
1808 * values for the first u32 field of the eVMCS.
1809 */
1810 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
1811 (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
1812 nested_release_evmcs(vcpu);
1813 return 0;
1814 }
1815
1816 vmx->nested.dirty_vmcs12 = true;
1817 /*
1818 * As we keep L2 state for one guest only, the 'hv_clean_fields' mask
1819 * can't be used when we switch between guests. Reset it here for
1820 * simplicity.
1821 */
1822 vmx->nested.hv_evmcs->hv_clean_fields &=
1823 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
1824 vmx->nested.hv_evmcs_vmptr = assist_page.current_nested_vmcs;
1825
1826 /*
1827 * Unlike normal vmcs12, enlightened vmcs12 is not fully
1828 * reloaded from the guest's memory (read-only fields, fields not
1829 * present in struct hv_enlightened_vmcs, ...). Make sure there
1830 * are no leftovers.
1831 */
1832 if (from_launch) {
1833 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1834 memset(vmcs12, 0, sizeof(*vmcs12));
1835 vmcs12->hdr.revision_id = VMCS12_REVISION;
1836 }
1837
1838 }
1839 return 1;
1840}
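/*
 * hv_clean_fields implements the eVMCS dirty-tracking protocol: the
 * guest sets a bit to tell the hypervisor that the corresponding group
 * of fields is unchanged, and copy_enlightened_to_vmcs12() only
 * re-reads groups whose bit is clear.  Clearing the whole mask here
 * therefore forces a full resynchronization on the next entry.
 */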
1841
1842void nested_sync_from_vmcs12(struct kvm_vcpu *vcpu)
1843{
1844 struct vcpu_vmx *vmx = to_vmx(vcpu);
1845
1846 /*
1847 * hv_evmcs may end up not being mapped after migration (when
1848 * L2 was running); map it here to make sure vmcs12 changes are
1849 * properly reflected.
1850 */
1851 if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs)
1852 nested_vmx_handle_enlightened_vmptrld(vcpu, false);
1853
1854 if (vmx->nested.hv_evmcs) {
1855 copy_vmcs12_to_enlightened(vmx);
1856 /* All fields are clean */
1857 vmx->nested.hv_evmcs->hv_clean_fields |=
1858 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
1859 } else {
1860 copy_vmcs12_to_shadow(vmx);
1861 }
1862
1863 vmx->nested.need_vmcs12_sync = false;
1864}
1865
1866static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
1867{
1868 struct vcpu_vmx *vmx =
1869 container_of(timer, struct vcpu_vmx, nested.preemption_timer);
1870
1871 vmx->nested.preemption_timer_expired = true;
1872 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
1873 kvm_vcpu_kick(&vmx->vcpu);
1874
1875 return HRTIMER_NORESTART;
1876}
1877
1878static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
1879{
1880 u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
1881 struct vcpu_vmx *vmx = to_vmx(vcpu);
1882
1883 /*
1884 * A timer value of zero is architecturally guaranteed to cause
1885 * a VMExit prior to executing any instructions in the guest.
1886 */
1887 if (preemption_timeout == 0) {
1888 vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
1889 return;
1890 }
1891
1892 if (vcpu->arch.virtual_tsc_khz == 0)
1893 return;
1894
1895 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
1896 preemption_timeout *= 1000000;
1897 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
1898 hrtimer_start(&vmx->nested.preemption_timer,
1899 ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
1900}
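/*
 * Illustrative sketch (not in the original file): the arithmetic above
 * converts "preemption timer ticks" to host nanoseconds as
 *   ns = (ticks << VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE) * 1000000 / virtual_tsc_khz
 * The helper below is hypothetical and only restates that formula.
 */
static inline u64 example_preemption_ticks_to_ns(u64 ticks, u32 virtual_tsc_khz)
{
	u64 ns = ticks << VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; /* TSC cycles */

	ns *= 1000000;			/* scale before dividing by kHz */
	do_div(ns, virtual_tsc_khz);	/* cycles * 1e6 / kHz == nanoseconds */
	return ns;
}
/*
 * E.g. ticks = 1000 with virtual_tsc_khz = 2000000 (a 2 GHz guest TSC) gives
 * 32000 cycles and therefore 16000 ns; vmx_get_preemption_timer_value()
 * further below performs the inverse conversion when saving the remaining
 * time back to vmcs12.
 */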
1901
1902static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
1903{
1904 if (vmx->nested.nested_run_pending &&
1905 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
1906 return vmcs12->guest_ia32_efer;
1907 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
1908 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
1909 else
1910 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
1911}
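/*
 * Illustrative note (not in the original file): for instance, if the entry is
 * pending and vmcs12 sets VM_ENTRY_LOAD_IA32_EFER with guest_ia32_efer =
 * EFER_SCE | EFER_LME | EFER_LMA, that value is used verbatim; without the
 * load control, VM_ENTRY_IA32E_MODE alone forces LMA/LME on top of the
 * current vcpu EFER, and both bits are cleared otherwise.
 */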
1912
1913static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
1914{
1915 /*
1916 * If vmcs02 hasn't been initialized, set the constant vmcs02 state
1917 * according to L0's settings (vmcs12 is irrelevant here). Host
1918 * fields that come from L0 and are not constant, e.g. HOST_CR3,
1919 * will be set as needed prior to VMLAUNCH/VMRESUME.
1920 */
1921 if (vmx->nested.vmcs02_initialized)
1922 return;
1923 vmx->nested.vmcs02_initialized = true;
1924
1925 /*
1926	 * We don't care what the EPTP value is; we just need to
1927	 * guarantee it's valid so that we don't get a false positive
1928	 * when doing early consistency checks.
1929 */
1930 if (enable_ept && nested_early_check)
1931 vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0));
1932
1933 /* All VMFUNCs are currently emulated through L0 vmexits. */
1934 if (cpu_has_vmx_vmfunc())
1935 vmcs_write64(VM_FUNCTION_CONTROL, 0);
1936
1937 if (cpu_has_vmx_posted_intr())
1938 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
1939
1940 if (cpu_has_vmx_msr_bitmap())
1941 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
1942
1943 if (enable_pml)
1944 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
1945
1946 /*
1947 * Set the MSR load/store lists to match L0's settings. Only the
1948 * addresses are constant (for vmcs02), the counts can change based
1949 * on L2's behavior, e.g. switching to/from long mode.
1950 */
1951 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
1952 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
1953 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
1954
1955 vmx_set_constant_host_state(vmx);
1956}
1957
1958static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx,
1959 struct vmcs12 *vmcs12)
1960{
1961 prepare_vmcs02_constant_state(vmx);
1962
1963 vmcs_write64(VMCS_LINK_POINTER, -1ull);
1964
1965 if (enable_vpid) {
1966 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
1967 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
1968 else
1969 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
1970 }
1971}
1972
1973static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
1974{
1975 u32 exec_control, vmcs12_exec_ctrl;
1976 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
1977
1978 if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
1979 prepare_vmcs02_early_full(vmx, vmcs12);
1980
1981 /*
1982 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
1983 * entry, but only if the current (host) sp changed from the value
1984 * we wrote last (vmx->host_rsp). This cache is no longer relevant
1985 * if we switch vmcs, and rather than hold a separate cache per vmcs,
1986 * here we just force the write to happen on entry. host_rsp will
1987 * also be written unconditionally by nested_vmx_check_vmentry_hw()
1988 * if we are doing early consistency checks via hardware.
1989 */
1990 vmx->host_rsp = 0;
1991
1992 /*
1993 * PIN CONTROLS
1994 */
1995 exec_control = vmcs12->pin_based_vm_exec_control;
1996
1997 /* Preemption timer setting is computed directly in vmx_vcpu_run. */
1998 exec_control |= vmcs_config.pin_based_exec_ctrl;
1999 exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
2000 vmx->loaded_vmcs->hv_timer_armed = false;
2001
2002 /* Posted interrupts setting is only taken from vmcs12. */
2003 if (nested_cpu_has_posted_intr(vmcs12)) {
2004 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
2005 vmx->nested.pi_pending = false;
2006 } else {
2007 exec_control &= ~PIN_BASED_POSTED_INTR;
2008 }
2009 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
2010
2011 /*
2012 * EXEC CONTROLS
2013 */
2014 exec_control = vmx_exec_control(vmx); /* L0's desires */
2015 exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2016 exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
2017 exec_control &= ~CPU_BASED_TPR_SHADOW;
2018 exec_control |= vmcs12->cpu_based_vm_exec_control;
2019
2020 /*
2021 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
2022 * nested_get_vmcs12_pages can't fix it up, the illegal value
2023 * will result in a VM entry failure.
2024 */
2025 if (exec_control & CPU_BASED_TPR_SHADOW) {
2026 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
2027 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
2028 } else {
2029#ifdef CONFIG_X86_64
2030 exec_control |= CPU_BASED_CR8_LOAD_EXITING |
2031 CPU_BASED_CR8_STORE_EXITING;
2032#endif
2033 }
2034
2035 /*
2036 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
2037 * for I/O port accesses.
2038 */
2039 exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
2040 exec_control |= CPU_BASED_UNCOND_IO_EXITING;
2041 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
2042
2043 /*
2044 * SECONDARY EXEC CONTROLS
2045 */
2046 if (cpu_has_secondary_exec_ctrls()) {
2047 exec_control = vmx->secondary_exec_control;
2048
2049 /* Take the following fields only from vmcs12 */
2050 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2051 SECONDARY_EXEC_ENABLE_INVPCID |
2052 SECONDARY_EXEC_RDTSCP |
2053 SECONDARY_EXEC_XSAVES |
2054 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2055 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2056 SECONDARY_EXEC_ENABLE_VMFUNC);
2057 if (nested_cpu_has(vmcs12,
2058 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
2059 vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
2060 ~SECONDARY_EXEC_ENABLE_PML;
2061 exec_control |= vmcs12_exec_ctrl;
2062 }
2063
2064 /* VMCS shadowing for L2 is emulated for now */
2065 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
2066
2067 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
2068 vmcs_write16(GUEST_INTR_STATUS,
2069 vmcs12->guest_intr_status);
2070
2071 /*
2072 * Write an illegal value to APIC_ACCESS_ADDR. Later,
2073 * nested_get_vmcs12_pages will either fix it up or
2074 * remove the VM execution control.
2075 */
2076 if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
2077 vmcs_write64(APIC_ACCESS_ADDR, -1ull);
2078
2079 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
2080 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
2081
2082 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
2083 }
2084
2085 /*
2086 * ENTRY CONTROLS
2087 *
2088 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
2089 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
2090 * on the related bits (if supported by the CPU) in the hope that
2091 * we can avoid VMWrites during vmx_set_efer().
2092 */
2093 exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) &
2094 ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
2095 if (cpu_has_load_ia32_efer()) {
2096 if (guest_efer & EFER_LMA)
2097 exec_control |= VM_ENTRY_IA32E_MODE;
2098 if (guest_efer != host_efer)
2099 exec_control |= VM_ENTRY_LOAD_IA32_EFER;
2100 }
2101 vm_entry_controls_init(vmx, exec_control);
2102
2103 /*
2104 * EXIT CONTROLS
2105 *
2106 * L2->L1 exit controls are emulated - the hardware exit is to L0 so
2107 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
2108 * bits may be modified by vmx_set_efer() in prepare_vmcs02().
2109 */
2110 exec_control = vmx_vmexit_ctrl();
2111 if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
2112 exec_control |= VM_EXIT_LOAD_IA32_EFER;
2113 vm_exit_controls_init(vmx, exec_control);
2114
2115 /*
2116 * Conceptually we want to copy the PML address and index from
2117 * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
2118 * since we always flush the log on each vmexit and never change
2119 * the PML address (once set), this happens to be equivalent to
2120 * simply resetting the index in vmcs02.
2121 */
2122 if (enable_pml)
2123 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
2124
2125 /*
2126 * Interrupt/Exception Fields
2127 */
2128 if (vmx->nested.nested_run_pending) {
2129 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2130 vmcs12->vm_entry_intr_info_field);
2131 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2132 vmcs12->vm_entry_exception_error_code);
2133 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2134 vmcs12->vm_entry_instruction_len);
2135 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
2136 vmcs12->guest_interruptibility_info);
2137 vmx->loaded_vmcs->nmi_known_unmasked =
2138 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
2139 } else {
2140 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
2141 }
2142}
2143
2144static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2145{
2146 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2147
2148 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2149 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
2150 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
2151 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
2152 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
2153 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
2154 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
2155 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
2156 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
2157 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
2158 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
2159 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
2160 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
2161 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
2162 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
2163 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
2164 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
2165 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
2166 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
2167 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
2168 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
2169 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
2170 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
2171 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
2172 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
2173 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
2174 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
2175 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
2176 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
2177 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
2178 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
2179 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
2180 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
2181 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
2182 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
2183 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
2184 }
2185
2186 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2187 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
2188 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
2189 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
2190 vmcs12->guest_pending_dbg_exceptions);
2191 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
2192 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
2193
2194 /*
2195		 * L1 may access L2's PDPTRs, so save them in order to
2196		 * construct vmcs12.
2197 */
2198 if (enable_ept) {
2199 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2200 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2201 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2202 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2203 }
2204 }
2205
2206 if (nested_cpu_has_xsaves(vmcs12))
2207 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
2208
2209 /*
2210 * Whether page-faults are trapped is determined by a combination of
2211 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
2212 * If enable_ept, L0 doesn't care about page faults and we should
2213 * set all of these to L1's desires. However, if !enable_ept, L0 does
2214 * care about (at least some) page faults, and because it is not easy
2215 * (if at all possible?) to merge L0 and L1's desires, we simply ask
2216 * to exit on each and every L2 page fault. This is done by setting
2217 * MASK=MATCH=0 and (see below) EB.PF=1.
2218 * Note that below we don't need special code to set EB.PF beyond the
2219 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
2220 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
2221 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
2222 */
2223 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
2224 enable_ept ? vmcs12->page_fault_error_code_mask : 0);
2225 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
2226 enable_ept ? vmcs12->page_fault_error_code_match : 0);
2227
2228 if (cpu_has_vmx_apicv()) {
2229 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
2230 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
2231 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
2232 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
2233 }
2234
2235 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
2236 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2237
2238 set_cr4_guest_host_mask(vmx);
2239
2240 if (kvm_mpx_supported()) {
2241 if (vmx->nested.nested_run_pending &&
2242 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
2243 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
2244 else
2245 vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
2246 }
2247}
2248
2249/*
2250 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
2251 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
2252 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
2253 * guest in a way that will both be appropriate to L1's requests, and our
2254 * needs. In addition to modifying the active vmcs (which is vmcs02), this
2255 * function also has additional necessary side-effects, like setting various
2256 * vcpu->arch fields.
2257 * Returns 0 on success, 1 on failure. Invalid state exit qualification code
2258 * is assigned to entry_failure_code on failure.
2259 */
2260static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2261 u32 *entry_failure_code)
2262{
2263 struct vcpu_vmx *vmx = to_vmx(vcpu);
2264 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2265
2266 if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) {
2267 prepare_vmcs02_full(vmx, vmcs12);
2268 vmx->nested.dirty_vmcs12 = false;
2269 }
2270
2271 /*
2272 * First, the fields that are shadowed. This must be kept in sync
2273 * with vmcs_shadow_fields.h.
2274 */
2275 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2276 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
2277 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
2278 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
2279 }
2280
2281 if (vmx->nested.nested_run_pending &&
2282 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
2283 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
2284 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
2285 } else {
2286 kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
2287 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
2288 }
2289 vmx_set_rflags(vcpu, vmcs12->guest_rflags);
2290
2291 vmx->nested.preemption_timer_expired = false;
2292 if (nested_cpu_has_preemption_timer(vmcs12))
2293 vmx_start_preemption_timer(vcpu);
2294
2295 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
2296 * bitwise-or of what L1 wants to trap for L2, and what we want to
2297 * trap. Note that CR0.TS also needs updating - we do this later.
2298 */
2299 update_exception_bitmap(vcpu);
2300 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
2301 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2302
2303 if (vmx->nested.nested_run_pending &&
2304 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
2305 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
2306 vcpu->arch.pat = vmcs12->guest_ia32_pat;
2307 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2308 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
2309 }
2310
2311 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
2312
2313 if (kvm_has_tsc_control)
2314 decache_tsc_multiplier(vmx);
2315
2316 if (enable_vpid) {
2317 /*
2318		 * There is no direct mapping between vpid02 and vpid12: vpid02
2319		 * is per-vCPU for L0 and reused, while a change of vpid12 is
2320		 * handled with a single INVVPID during nested vmentry.
2321		 * vpid12 is allocated by L1 for L2, so it will not influence
2322		 * the global bitmap (used for vpid01 and vpid02 allocation)
2323		 * even if L1 spawns a lot of nested vCPUs.
2324 */
2325 if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) {
2326 if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
2327 vmx->nested.last_vpid = vmcs12->virtual_processor_id;
2328 __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false);
2329 }
2330 } else {
2331 /*
2332			 * If L1 uses EPT, then L0 needs to execute INVEPT on
2333			 * EPTP02 instead of EPTP01. Therefore, delay the TLB
2334			 * flush until vmcs02->eptp is fully updated by
2335 * KVM_REQ_LOAD_CR3. Note that this assumes
2336 * KVM_REQ_TLB_FLUSH is evaluated after
2337 * KVM_REQ_LOAD_CR3 in vcpu_enter_guest().
2338 */
2339 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2340 }
2341 }
2342
2343 if (nested_cpu_has_ept(vmcs12))
2344 nested_ept_init_mmu_context(vcpu);
2345 else if (nested_cpu_has2(vmcs12,
2346 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2347 vmx_flush_tlb(vcpu, true);
2348
2349 /*
2350	 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying the
2351	 * bits which we consider mandatorily enabled.
2352	 * CR0_READ_SHADOW is what L2 should have expected to read given
2353	 * L1's specifications; it's not enough to take
2354	 * vmcs12->cr0_read_shadow because our cr0_guest_host_mask may
2355	 * have more bits set than L1 expected.
2356 */
2357 vmx_set_cr0(vcpu, vmcs12->guest_cr0);
2358 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
2359
2360 vmx_set_cr4(vcpu, vmcs12->guest_cr4);
2361 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
2362
2363 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
2364 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
2365 vmx_set_efer(vcpu, vcpu->arch.efer);
2366
2367 /*
2368 * Guest state is invalid and unrestricted guest is disabled,
2369 * which means L1 attempted VMEntry to L2 with invalid state.
2370 * Fail the VMEntry.
2371 */
2372 if (vmx->emulation_required) {
2373 *entry_failure_code = ENTRY_FAIL_DEFAULT;
2374 return 1;
2375 }
2376
2377	/* Load vmcs12->guest_cr3, whether using EPT or shadow page tables. */
2378 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
2379 entry_failure_code))
2380 return 1;
2381
2382 if (!enable_ept)
2383 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
2384
2385 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
2386 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
2387 return 0;
2388}
2389
2390static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
2391{
2392 if (!nested_cpu_has_nmi_exiting(vmcs12) &&
2393 nested_cpu_has_virtual_nmis(vmcs12))
2394 return -EINVAL;
2395
2396 if (!nested_cpu_has_virtual_nmis(vmcs12) &&
2397 nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING))
2398 return -EINVAL;
2399
2400 return 0;
2401}
2402
2403static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
2404{
2405 struct vcpu_vmx *vmx = to_vmx(vcpu);
2406 int maxphyaddr = cpuid_maxphyaddr(vcpu);
2407
2408 /* Check for memory type validity */
2409 switch (address & VMX_EPTP_MT_MASK) {
2410 case VMX_EPTP_MT_UC:
2411 if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))
2412 return false;
2413 break;
2414 case VMX_EPTP_MT_WB:
2415 if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))
2416 return false;
2417 break;
2418 default:
2419 return false;
2420 }
2421
2422	/* Only a 4-level page-walk length is valid */
2423 if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4)
2424 return false;
2425
2426 /* Reserved bits should not be set */
2427 if (address >> maxphyaddr || ((address >> 7) & 0x1f))
2428 return false;
2429
2430 /* AD, if set, should be supported */
2431 if (address & VMX_EPTP_AD_ENABLE_BIT) {
2432 if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))
2433 return false;
2434 }
2435
2436 return true;
2437}
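/*
 * Illustrative sketch (not in the original file): an EPTP that passes the
 * checks above packs its fields as
 *   bits  2:0   memory type (6 = write-back),
 *   bits  5:3   page-walk length minus one (3 for the required 4 levels),
 *   bit   6     optional accessed/dirty flag enable,
 *   bits 11:7   reserved, must be zero,
 *   bits (MAXPHYADDR-1):12  physical address of the EPT PML4 table.
 * The helper name below is hypothetical and only shows that packing.
 */
static inline u64 example_build_eptp(u64 pml4_pa, bool enable_ad)
{
	u64 eptp = pml4_pa | VMX_EPTP_MT_WB | VMX_EPTP_PWL_4;

	if (enable_ad)
		eptp |= VMX_EPTP_AD_ENABLE_BIT;
	return eptp;
}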
2438
2439/*
2440 * Checks related to VM-Execution Control Fields
2441 */
2442static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
2443 struct vmcs12 *vmcs12)
2444{
2445 struct vcpu_vmx *vmx = to_vmx(vcpu);
2446
2447 if (!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
2448 vmx->nested.msrs.pinbased_ctls_low,
2449 vmx->nested.msrs.pinbased_ctls_high) ||
2450 !vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
2451 vmx->nested.msrs.procbased_ctls_low,
2452 vmx->nested.msrs.procbased_ctls_high))
2453 return -EINVAL;
2454
2455 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
2456 !vmx_control_verify(vmcs12->secondary_vm_exec_control,
2457 vmx->nested.msrs.secondary_ctls_low,
2458 vmx->nested.msrs.secondary_ctls_high))
2459 return -EINVAL;
2460
2461 if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu) ||
2462 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) ||
2463 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) ||
2464 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) ||
2465 nested_vmx_check_apic_access_controls(vcpu, vmcs12) ||
2466 nested_vmx_check_apicv_controls(vcpu, vmcs12) ||
2467 nested_vmx_check_nmi_controls(vmcs12) ||
2468 nested_vmx_check_pml_controls(vcpu, vmcs12) ||
2469 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) ||
2470 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) ||
2471 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) ||
2472 (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
2473 return -EINVAL;
2474
2475 if (nested_cpu_has_ept(vmcs12) &&
2476 !valid_ept_address(vcpu, vmcs12->ept_pointer))
2477 return -EINVAL;
2478
2479 if (nested_cpu_has_vmfunc(vmcs12)) {
2480 if (vmcs12->vm_function_control &
2481 ~vmx->nested.msrs.vmfunc_controls)
2482 return -EINVAL;
2483
2484 if (nested_cpu_has_eptp_switching(vmcs12)) {
2485 if (!nested_cpu_has_ept(vmcs12) ||
2486 !page_address_valid(vcpu, vmcs12->eptp_list_address))
2487 return -EINVAL;
2488 }
2489 }
2490
2491 return 0;
2492}
2493
2494/*
2495 * Checks related to VM-Exit Control Fields
2496 */
2497static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
2498 struct vmcs12 *vmcs12)
2499{
2500 struct vcpu_vmx *vmx = to_vmx(vcpu);
2501
2502 if (!vmx_control_verify(vmcs12->vm_exit_controls,
2503 vmx->nested.msrs.exit_ctls_low,
2504 vmx->nested.msrs.exit_ctls_high) ||
2505 nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))
2506 return -EINVAL;
2507
2508 return 0;
2509}
2510
2511/*
2512 * Checks related to VM-Entry Control Fields
2513 */
2514static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
2515 struct vmcs12 *vmcs12)
2516{
2517 struct vcpu_vmx *vmx = to_vmx(vcpu);
2518
2519 if (!vmx_control_verify(vmcs12->vm_entry_controls,
2520 vmx->nested.msrs.entry_ctls_low,
2521 vmx->nested.msrs.entry_ctls_high))
2522 return -EINVAL;
2523
2524 /*
2525 * From the Intel SDM, volume 3:
2526 * Fields relevant to VM-entry event injection must be set properly.
2527 * These fields are the VM-entry interruption-information field, the
2528 * VM-entry exception error code, and the VM-entry instruction length.
2529 */
2530 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
2531 u32 intr_info = vmcs12->vm_entry_intr_info_field;
2532 u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
2533 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
2534 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
2535 bool should_have_error_code;
2536 bool urg = nested_cpu_has2(vmcs12,
2537 SECONDARY_EXEC_UNRESTRICTED_GUEST);
2538 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
2539
2540 /* VM-entry interruption-info field: interruption type */
2541 if (intr_type == INTR_TYPE_RESERVED ||
2542 (intr_type == INTR_TYPE_OTHER_EVENT &&
2543 !nested_cpu_supports_monitor_trap_flag(vcpu)))
2544 return -EINVAL;
2545
2546 /* VM-entry interruption-info field: vector */
2547 if ((intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
2548 (intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
2549 (intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
2550 return -EINVAL;
2551
2552 /* VM-entry interruption-info field: deliver error code */
2553 should_have_error_code =
2554 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
2555 x86_exception_has_error_code(vector);
2556 if (has_error_code != should_have_error_code)
2557 return -EINVAL;
2558
2559 /* VM-entry exception error code */
2560 if (has_error_code &&
2561 vmcs12->vm_entry_exception_error_code & GENMASK(31, 15))
2562 return -EINVAL;
2563
2564 /* VM-entry interruption-info field: reserved bits */
2565 if (intr_info & INTR_INFO_RESVD_BITS_MASK)
2566 return -EINVAL;
2567
2568 /* VM-entry instruction length */
2569 switch (intr_type) {
2570 case INTR_TYPE_SOFT_EXCEPTION:
2571 case INTR_TYPE_SOFT_INTR:
2572 case INTR_TYPE_PRIV_SW_EXCEPTION:
2573 if ((vmcs12->vm_entry_instruction_len > 15) ||
2574 (vmcs12->vm_entry_instruction_len == 0 &&
2575 !nested_cpu_has_zero_length_injection(vcpu)))
2576 return -EINVAL;
2577 }
2578 }
2579
2580 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
2581 return -EINVAL;
2582
2583 return 0;
2584}
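/*
 * Illustrative sketch (not in the original file): the interruption-information
 * field checked above encodes the vector in bits 7:0, the interruption type in
 * bits 10:8, "deliver error code" in bit 11 and "valid" in bit 31. A value
 * that would pass these checks for injecting #GP with an error code could be
 * built as below; the helper name is hypothetical.
 */
static inline u32 example_gp_intr_info(void)
{
	return GP_VECTOR | INTR_TYPE_HARD_EXCEPTION |
	       INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK;
	/* vm_entry_exception_error_code must then keep bits 31:15 clear. */
}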
2585
2586/*
2587 * Checks related to Host Control Registers and MSRs
2588 */
2589static int nested_check_host_control_regs(struct kvm_vcpu *vcpu,
2590 struct vmcs12 *vmcs12)
2591{
2592 bool ia32e;
2593
2594 if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) ||
2595 !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
2596 !nested_cr3_valid(vcpu, vmcs12->host_cr3))
2597 return -EINVAL;
2598 /*
2599 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
2600 * IA32_EFER MSR must be 0 in the field for that register. In addition,
2601 * the values of the LMA and LME bits in the field must each be that of
2602 * the host address-space size VM-exit control.
2603 */
2604 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
2605 ia32e = (vmcs12->vm_exit_controls &
2606 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
2607 if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
2608 ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
2609 ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
2610 return -EINVAL;
2611 }
2612
2613 return 0;
2614}
2615
2616/*
2617 * Checks related to Guest Non-register State
2618 */
2619static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
2620{
2621 if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
2622 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
2623 return -EINVAL;
2624
2625 return 0;
2626}
2627
2628static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu,
2629 struct vmcs12 *vmcs12)
2630{
2631 if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
2632 nested_check_vm_exit_controls(vcpu, vmcs12) ||
2633 nested_check_vm_entry_controls(vcpu, vmcs12))
2634 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
2635
2636 if (nested_check_host_control_regs(vcpu, vmcs12))
2637 return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
2638
2639 if (nested_check_guest_non_reg_state(vmcs12))
2640 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
2641
2642 return 0;
2643}
2644
2645static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
2646 struct vmcs12 *vmcs12)
2647{
2648 int r;
2649 struct page *page;
2650 struct vmcs12 *shadow;
2651
2652 if (vmcs12->vmcs_link_pointer == -1ull)
2653 return 0;
2654
2655 if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))
2656 return -EINVAL;
2657
2658 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
2659 if (is_error_page(page))
2660 return -EINVAL;
2661
2662 r = 0;
2663 shadow = kmap(page);
2664 if (shadow->hdr.revision_id != VMCS12_REVISION ||
2665 shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
2666 r = -EINVAL;
2667 kunmap(page);
2668 kvm_release_page_clean(page);
2669 return r;
2670}
2671
2672static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu,
2673 struct vmcs12 *vmcs12,
2674 u32 *exit_qual)
2675{
2676 bool ia32e;
2677
2678 *exit_qual = ENTRY_FAIL_DEFAULT;
2679
2680 if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
2681 !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
2682 return 1;
2683
2684 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
2685 *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
2686 return 1;
2687 }
2688
2689 /*
2690 * If the load IA32_EFER VM-entry control is 1, the following checks
2691 * are performed on the field for the IA32_EFER MSR:
2692 * - Bits reserved in the IA32_EFER MSR must be 0.
2693 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
2694	 *   the IA-32e mode guest VM-entry control. It must also be identical
2695 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
2696 * CR0.PG) is 1.
2697 */
2698 if (to_vmx(vcpu)->nested.nested_run_pending &&
2699 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
2700 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
2701 if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
2702 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
2703 ((vmcs12->guest_cr0 & X86_CR0_PG) &&
2704 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
2705 return 1;
2706 }
2707
2708 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
2709 (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
2710 (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
2711 return 1;
2712
2713 return 0;
2714}
2715
2716static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
2717{
2718 struct vcpu_vmx *vmx = to_vmx(vcpu);
2719 unsigned long cr3, cr4;
2720
2721 if (!nested_early_check)
2722 return 0;
2723
2724 if (vmx->msr_autoload.host.nr)
2725 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
2726 if (vmx->msr_autoload.guest.nr)
2727 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
2728
2729 preempt_disable();
2730
2731 vmx_prepare_switch_to_guest(vcpu);
2732
2733 /*
2734 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
2735 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to
2736	 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
2737 * there is no need to preserve other bits or save/restore the field.
2738 */
2739 vmcs_writel(GUEST_RFLAGS, 0);
2740
2741 cr3 = __get_current_cr3_fast();
2742 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
2743 vmcs_writel(HOST_CR3, cr3);
2744 vmx->loaded_vmcs->host_state.cr3 = cr3;
2745 }
2746
2747 cr4 = cr4_read_shadow();
2748 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
2749 vmcs_writel(HOST_CR4, cr4);
2750 vmx->loaded_vmcs->host_state.cr4 = cr4;
2751 }
2752
2753 vmx->__launched = vmx->loaded_vmcs->launched;
2754
2755 asm(
2756 /* Set HOST_RSP */
2757 "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
2758 __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
2759 "mov %%" _ASM_SP ", %c[host_rsp](%1)\n\t"
2760 "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
2761
2762 /* Check if vmlaunch or vmresume is needed */
2763 "cmpl $0, %c[launched](%% " _ASM_CX")\n\t"
2764
2765 "call vmx_vmenter\n\t"
2766
2767 /* Set vmx->fail accordingly */
2768 "setbe %c[fail](%% " _ASM_CX")\n\t"
2769 : ASM_CALL_CONSTRAINT
2770 : "c"(vmx), "d"((unsigned long)HOST_RSP),
2771 [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
2772 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
2773 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
2774 [wordsize]"i"(sizeof(ulong))
2775 : "rax", "cc", "memory"
2776 );
2777
2778 preempt_enable();
2779
2780 if (vmx->msr_autoload.host.nr)
2781 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
2782 if (vmx->msr_autoload.guest.nr)
2783 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2784
2785 if (vmx->fail) {
2786 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
2787 VMXERR_ENTRY_INVALID_CONTROL_FIELD);
2788 vmx->fail = 0;
2789 return 1;
2790 }
2791
2792 /*
2793 * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
2794 */
2795 local_irq_enable();
2796 if (hw_breakpoint_active())
2797 set_debugreg(__this_cpu_read(cpu_dr7), 7);
2798
2799 /*
2800 * A non-failing VMEntry means we somehow entered guest mode with
2801 * an illegal RIP, and that's just the tip of the iceberg. There
2802 * is no telling what memory has been modified or what state has
2803 * been exposed to unknown code. Hitting this all but guarantees
2804 * a (very critical) hardware issue.
2805 */
2806 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
2807 VMX_EXIT_REASONS_FAILED_VMENTRY));
2808
2809 return 0;
2810}
2811STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw);
2812
2813
2814static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
2815 struct vmcs12 *vmcs12);
2816
2817static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
2818{
2819 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2820 struct vcpu_vmx *vmx = to_vmx(vcpu);
2821 struct page *page;
2822 u64 hpa;
2823
2824 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
2825 /*
2826 * Translate L1 physical address to host physical
2827 * address for vmcs02. Keep the page pinned, so this
2828 * physical address remains valid. We keep a reference
2829 * to it so we can release it later.
2830 */
2831 if (vmx->nested.apic_access_page) { /* shouldn't happen */
2832 kvm_release_page_dirty(vmx->nested.apic_access_page);
2833 vmx->nested.apic_access_page = NULL;
2834 }
2835 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
2836 /*
2837		 * If the translation failed, it doesn't matter: this feature
2838		 * asks to exit when accessing the given address, and if it
2839 * can never be accessed, this feature won't do
2840 * anything anyway.
2841 */
2842 if (!is_error_page(page)) {
2843 vmx->nested.apic_access_page = page;
2844 hpa = page_to_phys(vmx->nested.apic_access_page);
2845 vmcs_write64(APIC_ACCESS_ADDR, hpa);
2846 } else {
2847 vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
2848 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
2849 }
2850 }
2851
2852 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
2853 if (vmx->nested.virtual_apic_page) { /* shouldn't happen */
2854 kvm_release_page_dirty(vmx->nested.virtual_apic_page);
2855 vmx->nested.virtual_apic_page = NULL;
2856 }
2857 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr);
2858
2859 /*
2860 * If translation failed, VM entry will fail because
2861 * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull.
2862 * Failing the vm entry is _not_ what the processor
2863 * does but it's basically the only possibility we
2864 * have. We could still enter the guest if CR8 load
2865 * exits are enabled, CR8 store exits are enabled, and
2866 * virtualize APIC access is disabled; in this case
2867 * the processor would never use the TPR shadow and we
2868 * could simply clear the bit from the execution
2869 * control. But such a configuration is useless, so
2870 * let's keep the code simple.
2871 */
2872 if (!is_error_page(page)) {
2873 vmx->nested.virtual_apic_page = page;
2874 hpa = page_to_phys(vmx->nested.virtual_apic_page);
2875 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
2876 }
2877 }
2878
2879 if (nested_cpu_has_posted_intr(vmcs12)) {
2880 if (vmx->nested.pi_desc_page) { /* shouldn't happen */
2881 kunmap(vmx->nested.pi_desc_page);
2882 kvm_release_page_dirty(vmx->nested.pi_desc_page);
2883 vmx->nested.pi_desc_page = NULL;
2884 vmx->nested.pi_desc = NULL;
2885 vmcs_write64(POSTED_INTR_DESC_ADDR, -1ull);
2886 }
2887 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr);
2888 if (is_error_page(page))
2889 return;
2890 vmx->nested.pi_desc_page = page;
2891 vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page);
2892 vmx->nested.pi_desc =
2893 (struct pi_desc *)((void *)vmx->nested.pi_desc +
2894 (unsigned long)(vmcs12->posted_intr_desc_addr &
2895 (PAGE_SIZE - 1)));
2896 vmcs_write64(POSTED_INTR_DESC_ADDR,
2897 page_to_phys(vmx->nested.pi_desc_page) +
2898 (unsigned long)(vmcs12->posted_intr_desc_addr &
2899 (PAGE_SIZE - 1)));
2900 }
2901 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
2902 vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
2903 CPU_BASED_USE_MSR_BITMAPS);
2904 else
2905 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
2906 CPU_BASED_USE_MSR_BITMAPS);
2907}
2908
2909/*
2910 * Intel's VMX Instruction Reference specifies a common set of prerequisites
2911 * for running VMX instructions (except VMXON, whose prerequisites are
2912 * slightly different). It also specifies what exception to inject otherwise.
2913 * Note that many of these exceptions have priority over VM exits, so they
2914 * don't have to be checked again here.
2915 */
2916static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
2917{
2918 if (!to_vmx(vcpu)->nested.vmxon) {
2919 kvm_queue_exception(vcpu, UD_VECTOR);
2920 return 0;
2921 }
2922
2923 if (vmx_get_cpl(vcpu)) {
2924 kvm_inject_gp(vcpu, 0);
2925 return 0;
2926 }
2927
2928 return 1;
2929}
2930
2931static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
2932{
2933 u8 rvi = vmx_get_rvi();
2934 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
2935
2936 return ((rvi & 0xf0) > (vppr & 0xf0));
2937}
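/*
 * Illustrative note (not in the original file): the comparison above works on
 * priority classes (the high nibble of the vector/PPR). E.g. RVI = 0x51
 * (class 5) against VPPR = 0x40 (class 4) reports a deliverable interrupt,
 * while RVI = 0x41 against VPPR = 0x40 does not, as both are in class 4.
 */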
2938
2939static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
2940 struct vmcs12 *vmcs12);
2941
2942/*
2943 * If from_vmentry is false, this is being called from state restore (either RSM
2944 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
2945 *
2946 * Returns:
2947 *	0 - success, i.e. proceed with actual VMEnter
2948 *	1 - consistency check VMExit
2949 *	-1 - consistency check VMFail
2950 */
2951int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
2952{
2953 struct vcpu_vmx *vmx = to_vmx(vcpu);
2954 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2955 bool evaluate_pending_interrupts;
2956 u32 exit_reason = EXIT_REASON_INVALID_STATE;
2957 u32 exit_qual;
2958
2959 evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
2960 (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
2961 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
2962 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
2963
2964 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
2965 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
2966 if (kvm_mpx_supported() &&
2967 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
2968 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
2969
2970 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
2971
2972 prepare_vmcs02_early(vmx, vmcs12);
2973
2974 if (from_vmentry) {
2975 nested_get_vmcs12_pages(vcpu);
2976
2977 if (nested_vmx_check_vmentry_hw(vcpu)) {
2978 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
2979 return -1;
2980 }
2981
2982 if (nested_vmx_check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
2983 goto vmentry_fail_vmexit;
2984 }
2985
2986 enter_guest_mode(vcpu);
2987 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
2988 vcpu->arch.tsc_offset += vmcs12->tsc_offset;
2989
2990 if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
2991 goto vmentry_fail_vmexit_guest_mode;
2992
2993 if (from_vmentry) {
2994 exit_reason = EXIT_REASON_MSR_LOAD_FAIL;
2995 exit_qual = nested_vmx_load_msr(vcpu,
2996 vmcs12->vm_entry_msr_load_addr,
2997 vmcs12->vm_entry_msr_load_count);
2998 if (exit_qual)
2999 goto vmentry_fail_vmexit_guest_mode;
3000 } else {
3001 /*
3002 * The MMU is not initialized to point at the right entities yet and
3003 * "get pages" would need to read data from the guest (i.e. we will
3004 * need to perform gpa to hpa translation). Request a call
3005 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs
3006 * have already been set at vmentry time and should not be reset.
3007 */
3008 kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
3009 }
3010
3011 /*
3012	 * If L1 had a pending IRQ/NMI until it executed
3013	 * VMLAUNCH/VMRESUME which wasn't delivered because it was
3014	 * disallowed (e.g. interrupts disabled), L0 needs to
3015	 * evaluate whether this pending event should cause an exit from
3016	 * L2 to L1 or be delivered directly to L2 (e.g. in case L1
3017	 * doesn't intercept EXTERNAL_INTERRUPT).
3018 *
3019 * Usually this would be handled by the processor noticing an
3020 * IRQ/NMI window request, or checking RVI during evaluation of
3021 * pending virtual interrupts. However, this setting was done
3022 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
3023 * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
3024 */
3025 if (unlikely(evaluate_pending_interrupts))
3026 kvm_make_request(KVM_REQ_EVENT, vcpu);
3027
3028 /*
3029 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
3030 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
3031 * returned as far as L1 is concerned. It will only return (and set
3032 * the success flag) when L2 exits (see nested_vmx_vmexit()).
3033 */
3034 return 0;
3035
3036 /*
3037 * A failed consistency check that leads to a VMExit during L1's
3038 * VMEnter to L2 is a variation of a normal VMexit, as explained in
3039 * 26.7 "VM-entry failures during or after loading guest state".
3040 */
3041vmentry_fail_vmexit_guest_mode:
3042 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
3043 vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
3044 leave_guest_mode(vcpu);
3045
3046vmentry_fail_vmexit:
3047 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3048
3049 if (!from_vmentry)
3050 return 1;
3051
3052 load_vmcs12_host_state(vcpu, vmcs12);
3053 vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
3054 vmcs12->exit_qualification = exit_qual;
3055 if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
3056 vmx->nested.need_vmcs12_sync = true;
3057 return 1;
3058}
3059
3060/*
3061 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
3062 * for running an L2 nested guest.
3063 */
3064static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
3065{
3066 struct vmcs12 *vmcs12;
3067 struct vcpu_vmx *vmx = to_vmx(vcpu);
3068 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
3069 int ret;
3070
3071 if (!nested_vmx_check_permission(vcpu))
3072 return 1;
3073
3074 if (!nested_vmx_handle_enlightened_vmptrld(vcpu, true))
3075 return 1;
3076
3077 if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
3078 return nested_vmx_failInvalid(vcpu);
3079
3080 vmcs12 = get_vmcs12(vcpu);
3081
3082 /*
3083 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
3084 * that there *is* a valid VMCS pointer, RFLAGS.CF is set
3085 * rather than RFLAGS.ZF, and no error number is stored to the
3086 * VM-instruction error field.
3087 */
3088 if (vmcs12->hdr.shadow_vmcs)
3089 return nested_vmx_failInvalid(vcpu);
3090
3091 if (vmx->nested.hv_evmcs) {
3092 copy_enlightened_to_vmcs12(vmx);
3093 /* Enlightened VMCS doesn't have launch state */
3094 vmcs12->launch_state = !launch;
3095 } else if (enable_shadow_vmcs) {
3096 copy_shadow_to_vmcs12(vmx);
3097 }
3098
3099 /*
3100	 * on vmcs12 as required by the Intel SDM, and to act appropriately when
3101	 * they fail: as the SDM explains, some conditions should cause the
3102 * they fail: As the SDM explains, some conditions should cause the
3103 * instruction to fail, while others will cause the instruction to seem
3104 * to succeed, but return an EXIT_REASON_INVALID_STATE.
3105 * To speed up the normal (success) code path, we should avoid checking
3106 * for misconfigurations which will anyway be caught by the processor
3107 * when using the merged vmcs02.
3108 */
3109 if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)
3110 return nested_vmx_failValid(vcpu,
3111 VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
3112
3113 if (vmcs12->launch_state == launch)
3114 return nested_vmx_failValid(vcpu,
3115 launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
3116 : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
3117
3118 ret = nested_vmx_check_vmentry_prereqs(vcpu, vmcs12);
3119 if (ret)
3120 return nested_vmx_failValid(vcpu, ret);
3121
3122 /*
3123 * We're finally done with prerequisite checking, and can start with
3124 * the nested entry.
3125 */
3126 vmx->nested.nested_run_pending = 1;
3127 ret = nested_vmx_enter_non_root_mode(vcpu, true);
3128 vmx->nested.nested_run_pending = !ret;
3129 if (ret > 0)
3130 return 1;
3131 else if (ret)
3132 return nested_vmx_failValid(vcpu,
3133 VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3134
3135 /* Hide L1D cache contents from the nested guest. */
3136 vmx->vcpu.arch.l1tf_flush_l1d = true;
3137
3138 /*
3139 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
3140 * also be used as part of restoring nVMX state for
3141 * snapshot restore (migration).
3142 *
3143	 * In this flow, it is assumed that the vmcs12 cache was
3144	 * transferred as part of the captured nVMX state and should
3145	 * therefore not be read from guest memory (which may not
3146	 * exist on the destination host yet).
3147 */
3148 nested_cache_shadow_vmcs12(vcpu, vmcs12);
3149
3150 /*
3151 * If we're entering a halted L2 vcpu and the L2 vcpu won't be
3152 * awakened by event injection or by an NMI-window VM-exit or
3153 * by an interrupt-window VM-exit, halt the vcpu.
3154 */
3155 if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
3156 !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
3157 !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) &&
3158 !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) &&
3159 (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
3160 vmx->nested.nested_run_pending = 0;
3161 return kvm_vcpu_halt(vcpu);
3162 }
3163 return 1;
3164}
3165
3166/*
3167 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
3168 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
3169 * This function returns the new value we should put in vmcs12.guest_cr0.
3170 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
3171 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
3172 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
3173 * didn't trap the bit, because if L1 did, so would L0).
3174 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
3175 * been modified by L2, and L1 knows it. So just leave the old value of
3176 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
3177 * isn't relevant, because if L0 traps this bit it can set it to anything.
3178 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
3179 * changed these bits, and therefore they need to be updated, but L0
3180 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather
3181 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
3182 */
3183static inline unsigned long
3184vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3185{
3186 return
3187 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
3188 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
3189 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
3190 vcpu->arch.cr0_guest_owned_bits));
3191}
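/*
 * Illustrative note (not in the original file): suppose L0 leaves CR0.TS
 * guest-owned and L1 does not trap it either; TS is then read from vmcs02's
 * GUEST_CR0 (case 1). A bit L1 traps, e.g. CR0.MP in cr0_guest_host_mask, is
 * kept from vmcs12->guest_cr0 (case 2), and a bit that only L0 traps, e.g.
 * CR0.CD, is read from vmcs02's CR0_READ_SHADOW (case 3).
 */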
3192
3193static inline unsigned long
3194vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3195{
3196 return
3197 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
3198 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
3199 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
3200 vcpu->arch.cr4_guest_owned_bits));
3201}
3202
3203static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
3204 struct vmcs12 *vmcs12)
3205{
3206 u32 idt_vectoring;
3207 unsigned int nr;
3208
3209 if (vcpu->arch.exception.injected) {
3210 nr = vcpu->arch.exception.nr;
3211 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3212
3213 if (kvm_exception_is_soft(nr)) {
3214 vmcs12->vm_exit_instruction_len =
3215 vcpu->arch.event_exit_inst_len;
3216 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
3217 } else
3218 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
3219
3220 if (vcpu->arch.exception.has_error_code) {
3221 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
3222 vmcs12->idt_vectoring_error_code =
3223 vcpu->arch.exception.error_code;
3224 }
3225
3226 vmcs12->idt_vectoring_info_field = idt_vectoring;
3227 } else if (vcpu->arch.nmi_injected) {
3228 vmcs12->idt_vectoring_info_field =
3229 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
3230 } else if (vcpu->arch.interrupt.injected) {
3231 nr = vcpu->arch.interrupt.nr;
3232 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3233
3234 if (vcpu->arch.interrupt.soft) {
3235 idt_vectoring |= INTR_TYPE_SOFT_INTR;
3236 vmcs12->vm_entry_instruction_len =
3237 vcpu->arch.event_exit_inst_len;
3238 } else
3239 idt_vectoring |= INTR_TYPE_EXT_INTR;
3240
3241 vmcs12->idt_vectoring_info_field = idt_vectoring;
3242 }
3243}
3244
3245
3246static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
3247{
3248 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3249 gfn_t gfn;
3250
3251 /*
3252 * Don't need to mark the APIC access page dirty; it is never
3253 * written to by the CPU during APIC virtualization.
3254 */
3255
3256 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3257 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
3258 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3259 }
3260
3261 if (nested_cpu_has_posted_intr(vmcs12)) {
3262 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
3263 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3264 }
3265}
3266
3267static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
3268{
3269 struct vcpu_vmx *vmx = to_vmx(vcpu);
3270 int max_irr;
3271 void *vapic_page;
3272 u16 status;
3273
3274 if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
3275 return;
3276
3277 vmx->nested.pi_pending = false;
3278 if (!pi_test_and_clear_on(vmx->nested.pi_desc))
3279 return;
3280
3281 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
3282 if (max_irr != 256) {
3283 vapic_page = kmap(vmx->nested.virtual_apic_page);
3284 __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
3285 vapic_page, &max_irr);
3286 kunmap(vmx->nested.virtual_apic_page);
3287
3288 status = vmcs_read16(GUEST_INTR_STATUS);
3289 if ((u8)max_irr > ((u8)status & 0xff)) {
3290 status &= ~0xff;
3291 status |= (u8)max_irr;
3292 vmcs_write16(GUEST_INTR_STATUS, status);
3293 }
3294 }
3295
3296 nested_mark_vmcs12_pages_dirty(vcpu);
3297}
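/*
 * Illustrative note (not in the original file): find_last_bit() over the
 * 256-bit PIR yields the highest pending vector. E.g. if vector 0x61 is set
 * in the PIR and the current RVI (the low byte of GUEST_INTR_STATUS) is 0x31,
 * RVI is raised to 0x61 so the pending interrupt is evaluated on the next
 * VM-entry.
 */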
3298
3299static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
3300 unsigned long exit_qual)
3301{
3302 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3303 unsigned int nr = vcpu->arch.exception.nr;
3304 u32 intr_info = nr | INTR_INFO_VALID_MASK;
3305
3306 if (vcpu->arch.exception.has_error_code) {
3307 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
3308 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3309 }
3310
3311 if (kvm_exception_is_soft(nr))
3312 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3313 else
3314 intr_info |= INTR_TYPE_HARD_EXCEPTION;
3315
3316 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
3317 vmx_get_nmi_mask(vcpu))
3318 intr_info |= INTR_INFO_UNBLOCK_NMI;
3319
3320 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
3321}
3322
3323static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
3324{
3325 struct vcpu_vmx *vmx = to_vmx(vcpu);
3326 unsigned long exit_qual;
3327 bool block_nested_events =
3328 vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
3329
3330 if (vcpu->arch.exception.pending &&
3331 nested_vmx_check_exception(vcpu, &exit_qual)) {
3332 if (block_nested_events)
3333 return -EBUSY;
3334 nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
3335 return 0;
3336 }
3337
3338 if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
3339 vmx->nested.preemption_timer_expired) {
3340 if (block_nested_events)
3341 return -EBUSY;
3342 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
3343 return 0;
3344 }
3345
3346 if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
3347 if (block_nested_events)
3348 return -EBUSY;
3349 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
3350 NMI_VECTOR | INTR_TYPE_NMI_INTR |
3351 INTR_INFO_VALID_MASK, 0);
3352 /*
3353 * The NMI-triggered VM exit counts as injection:
3354 * clear this one and block further NMIs.
3355 */
3356 vcpu->arch.nmi_pending = 0;
3357 vmx_set_nmi_mask(vcpu, true);
3358 return 0;
3359 }
3360
3361 if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
3362 nested_exit_on_intr(vcpu)) {
3363 if (block_nested_events)
3364 return -EBUSY;
3365 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
3366 return 0;
3367 }
3368
3369 vmx_complete_nested_posted_interrupt(vcpu);
3370 return 0;
3371}
3372
3373static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
3374{
3375 ktime_t remaining =
3376 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
3377 u64 value;
3378
3379 if (ktime_to_ns(remaining) <= 0)
3380 return 0;
3381
3382 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
3383 do_div(value, 1000000);
3384 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
3385}
3386
3387/*
3388 * Update the guest state fields of vmcs12 to reflect changes that
3389 * occurred while L2 was running. (The "IA-32e mode guest" bit of the
3390 * VM-entry controls is also updated, since this is really a guest
3391 * state bit.)
3392 */
3393static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3394{
3395 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
3396 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
3397
3398 vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3399 vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
3400 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
3401
3402 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
3403 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
3404 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
3405 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
3406 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
3407 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
3408 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
3409 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
3410 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
3411 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
3412 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
3413 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
3414 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
3415 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
3416 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
3417 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
3418 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
3419 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
3420 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
3421 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
3422 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
3423 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
3424 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
3425 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
3426 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
3427 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
3428 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
3429 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
3430 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
3431 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
3432 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
3433 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
3434 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
3435 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
3436 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
3437 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
3438
3439 vmcs12->guest_interruptibility_info =
3440 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3441 vmcs12->guest_pending_dbg_exceptions =
3442 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
3443 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
3444 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
3445 else
3446 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
3447
3448 if (nested_cpu_has_preemption_timer(vmcs12)) {
3449 if (vmcs12->vm_exit_controls &
3450 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
3451 vmcs12->vmx_preemption_timer_value =
3452 vmx_get_preemption_timer_value(vcpu);
3453 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
3454 }
3455
3456 /*
3457 * In some cases (usually, nested EPT), L2 is allowed to change its
3458 * own CR3 without exiting. If it has changed it, we must keep it.
3459 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
3460 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
3461 *
3462 * Additionally, restore L2's PDPTR to vmcs12.
3463 */
3464 if (enable_ept) {
3465 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
3466 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
3467 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
3468 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
3469 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
3470 }
3471
3472 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
3473
3474 if (nested_cpu_has_vid(vmcs12))
3475 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
3476
3477 vmcs12->vm_entry_controls =
3478 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
3479 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
3480
3481 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) {
3482 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
3483 vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
3484 }
3485
3486 /* TODO: These cannot have changed unless we have MSR bitmaps and
3487 * the relevant bit asks not to trap the change */
3488 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
3489 vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
3490 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
3491 vmcs12->guest_ia32_efer = vcpu->arch.efer;
3492 vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
3493 vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
3494 vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
3495 if (kvm_mpx_supported())
3496 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3497}
3498
3499/*
3500 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
3501 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
3502 * and this function updates it to reflect the changes to the guest state while
3503 * L2 was running (and perhaps made some exits which were handled directly by L0
3504 * without going back to L1), and to reflect the exit reason.
 3505 * Note that we do not have to copy all VMCS fields here, just those that
 3506 * could have been changed by the L2 guest or the exit - i.e., the guest-state and
3507 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
3508 * which already writes to vmcs12 directly.
3509 */
3510static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
3511 u32 exit_reason, u32 exit_intr_info,
3512 unsigned long exit_qualification)
3513{
3514 /* update guest state fields: */
3515 sync_vmcs12(vcpu, vmcs12);
3516
3517 /* update exit information fields: */
3518
3519 vmcs12->vm_exit_reason = exit_reason;
3520 vmcs12->exit_qualification = exit_qualification;
3521 vmcs12->vm_exit_intr_info = exit_intr_info;
3522
3523 vmcs12->idt_vectoring_info_field = 0;
3524 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3525 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
3526
3527 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
3528 vmcs12->launch_state = 1;
3529
3530 /* vm_entry_intr_info_field is cleared on exit. Emulate this
3531 * instead of reading the real value. */
3532 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
3533
3534 /*
 3535		 * Transfer the event that L0 or L1 may have wanted to inject into
3536 * L2 to IDT_VECTORING_INFO_FIELD.
3537 */
3538 vmcs12_save_pending_event(vcpu, vmcs12);
3539
3540 /*
3541 * According to spec, there's no need to store the guest's
3542 * MSRs if the exit is due to a VM-entry failure that occurs
3543 * during or after loading the guest state. Since this exit
3544 * does not fall in that category, we need to save the MSRs.
3545 */
3546 if (nested_vmx_store_msr(vcpu,
3547 vmcs12->vm_exit_msr_store_addr,
3548 vmcs12->vm_exit_msr_store_count))
3549 nested_vmx_abort(vcpu,
3550 VMX_ABORT_SAVE_GUEST_MSR_FAIL);
3551 }
3552
3553 /*
3554 * Drop what we picked up for L2 via vmx_complete_interrupts. It is
3555 * preserved above and would only end up incorrectly in L1.
3556 */
3557 vcpu->arch.nmi_injected = false;
3558 kvm_clear_exception_queue(vcpu);
3559 kvm_clear_interrupt_queue(vcpu);
3560}
3561
3562/*
 3563 * Part of what we need to do when the nested L2 guest exits and we want to
 3564 * run its L1 parent is to reset L1's guest state to the host state specified
3565 * in vmcs12.
3566 * This function is to be called not only on normal nested exit, but also on
3567 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
3568 * Failures During or After Loading Guest State").
3569 * This function should be called when the active VMCS is L1's (vmcs01).
3570 */
3571static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3572 struct vmcs12 *vmcs12)
3573{
3574 struct kvm_segment seg;
3575 u32 entry_failure_code;
3576
3577 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
3578 vcpu->arch.efer = vmcs12->host_ia32_efer;
3579 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
3580 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
3581 else
3582 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
3583 vmx_set_efer(vcpu, vcpu->arch.efer);
3584
3585 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
3586 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
3587 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
3588 vmx_set_interrupt_shadow(vcpu, 0);
3589
3590 /*
3591 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
 3592	 * actually changed, because vmx_set_cr0 refers to the EFER value set above.
 3593	 *
 3594	 * CR0_GUEST_HOST_MASK is already set in the original vmcs01
 3595	 * (KVM doesn't change it).
3596 */
3597 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
3598 vmx_set_cr0(vcpu, vmcs12->host_cr0);
3599
3600 /* Same as above - no reason to call set_cr4_guest_host_mask(). */
3601 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
3602 vmx_set_cr4(vcpu, vmcs12->host_cr4);
3603
3604 nested_ept_uninit_mmu_context(vcpu);
3605
3606 /*
 3607	 * Only the PDPTE load can fail, as the value of cr3 was checked on entry
 3608	 * and could not have changed.
3609 */
3610 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
3611 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
3612
3613 if (!enable_ept)
3614 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
3615
3616 /*
3617 * If vmcs01 doesn't use VPID, CPU flushes TLB on every
3618 * VMEntry/VMExit. Thus, no need to flush TLB.
3619 *
3620 * If vmcs12 doesn't use VPID, L1 expects TLB to be
3621 * flushed on every VMEntry/VMExit.
3622 *
3623 * Otherwise, we can preserve TLB entries as long as we are
3624 * able to tag L1 TLB entries differently than L2 TLB entries.
3625 *
3626 * If vmcs12 uses EPT, we need to execute this flush on EPTP01
3627 * and therefore we request the TLB flush to happen only after VMCS EPTP
3628 * has been set by KVM_REQ_LOAD_CR3.
3629 */
3630 if (enable_vpid &&
3631 (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) {
3632 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3633 }
3634
3635 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
3636 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
3637 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
3638 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
3639 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
3640 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
3641 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
3642
3643 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
3644 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
3645 vmcs_write64(GUEST_BNDCFGS, 0);
3646
3647 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
3648 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
3649 vcpu->arch.pat = vmcs12->host_ia32_pat;
3650 }
3651 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
3652 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
3653 vmcs12->host_ia32_perf_global_ctrl);
3654
 3655	/* Set L1 segment info according to Intel SDM
 3656	 * 27.5.2, "Loading Host Segment and Descriptor-Table Registers". */
3657 seg = (struct kvm_segment) {
3658 .base = 0,
3659 .limit = 0xFFFFFFFF,
3660 .selector = vmcs12->host_cs_selector,
3661 .type = 11,
3662 .present = 1,
3663 .s = 1,
3664 .g = 1
3665 };
3666 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
3667 seg.l = 1;
3668 else
3669 seg.db = 1;
3670 vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
3671 seg = (struct kvm_segment) {
3672 .base = 0,
3673 .limit = 0xFFFFFFFF,
3674 .type = 3,
3675 .present = 1,
3676 .s = 1,
3677 .db = 1,
3678 .g = 1
3679 };
3680 seg.selector = vmcs12->host_ds_selector;
3681 vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
3682 seg.selector = vmcs12->host_es_selector;
3683 vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
3684 seg.selector = vmcs12->host_ss_selector;
3685 vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
3686 seg.selector = vmcs12->host_fs_selector;
3687 seg.base = vmcs12->host_fs_base;
3688 vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
3689 seg.selector = vmcs12->host_gs_selector;
3690 seg.base = vmcs12->host_gs_base;
3691 vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
3692 seg = (struct kvm_segment) {
3693 .base = vmcs12->host_tr_base,
3694 .limit = 0x67,
3695 .selector = vmcs12->host_tr_selector,
3696 .type = 11,
3697 .present = 1
3698 };
3699 vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
3700
3701 kvm_set_dr(vcpu, 7, 0x400);
3702 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
3703
3704 if (cpu_has_vmx_msr_bitmap())
3705 vmx_update_msr_bitmap(vcpu);
3706
3707 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
3708 vmcs12->vm_exit_msr_load_count))
3709 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
3710}
3711
3712static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
3713{
3714 struct shared_msr_entry *efer_msr;
3715 unsigned int i;
3716
3717 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
3718 return vmcs_read64(GUEST_IA32_EFER);
3719
3720 if (cpu_has_load_ia32_efer())
3721 return host_efer;
3722
3723 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
3724 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
3725 return vmx->msr_autoload.guest.val[i].value;
3726 }
3727
3728 efer_msr = find_msr_entry(vmx, MSR_EFER);
3729 if (efer_msr)
3730 return efer_msr->data;
3731
3732 return host_efer;
3733}
3734
3735static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
3736{
3737 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3738 struct vcpu_vmx *vmx = to_vmx(vcpu);
3739 struct vmx_msr_entry g, h;
3740 struct msr_data msr;
3741 gpa_t gpa;
3742 u32 i, j;
3743
3744 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
3745
3746 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
3747 /*
 3748		 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set,
 3749		 * as vmcs01.GUEST_DR7 contains a userspace-defined value
3750 * and vcpu->arch.dr7 is not squirreled away before the
3751 * nested VMENTER (not worth adding a variable in nested_vmx).
3752 */
3753 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
3754 kvm_set_dr(vcpu, 7, DR7_FIXED_1);
3755 else
3756 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
3757 }
3758
3759 /*
3760 * Note that calling vmx_set_{efer,cr0,cr4} is important as they
3761 * handle a variety of side effects to KVM's software model.
3762 */
3763 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
3764
3765 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
3766 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
3767
3768 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
3769 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
3770
3771 nested_ept_uninit_mmu_context(vcpu);
3772 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3773 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
3774
3775 /*
3776 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
3777 * from vmcs01 (if necessary). The PDPTRs are not loaded on
 3778	 * VMFail; like everything else, we just need to ensure our
3779 * software model is up-to-date.
3780 */
3781 ept_save_pdptrs(vcpu);
3782
3783 kvm_mmu_reset_context(vcpu);
3784
3785 if (cpu_has_vmx_msr_bitmap())
3786 vmx_update_msr_bitmap(vcpu);
3787
3788 /*
3789 * This nasty bit of open coding is a compromise between blindly
3790 * loading L1's MSRs using the exit load lists (incorrect emulation
3791 * of VMFail), leaving the nested VM's MSRs in the software model
3792 * (incorrect behavior) and snapshotting the modified MSRs (too
 3793	 * expensive since the lists are not bounded by hardware). For each
3794 * MSR that was (prematurely) loaded from the nested VMEntry load
3795 * list, reload it from the exit load list if it exists and differs
3796 * from the guest value. The intent is to stuff host state as
3797 * silently as possible, not to fully process the exit load list.
3798 */
3799 msr.host_initiated = false;
3800 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
3801 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
3802 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
3803 pr_debug_ratelimited(
3804 "%s read MSR index failed (%u, 0x%08llx)\n",
3805 __func__, i, gpa);
3806 goto vmabort;
3807 }
3808
3809 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
3810 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
3811 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
3812 pr_debug_ratelimited(
3813 "%s read MSR failed (%u, 0x%08llx)\n",
3814 __func__, j, gpa);
3815 goto vmabort;
3816 }
3817 if (h.index != g.index)
3818 continue;
3819 if (h.value == g.value)
3820 break;
3821
3822 if (nested_vmx_load_msr_check(vcpu, &h)) {
3823 pr_debug_ratelimited(
3824 "%s check failed (%u, 0x%x, 0x%x)\n",
3825 __func__, j, h.index, h.reserved);
3826 goto vmabort;
3827 }
3828
3829 msr.index = h.index;
3830 msr.data = h.value;
3831 if (kvm_set_msr(vcpu, &msr)) {
3832 pr_debug_ratelimited(
3833 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
3834 __func__, j, h.index, h.value);
3835 goto vmabort;
3836 }
3837 }
3838 }
3839
3840 return;
3841
3842vmabort:
3843 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
3844}
3845
3846/*
3847 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
3848 * and modify vmcs12 to make it see what it would expect to see there if
3849 * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
3850 */
3851void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
3852 u32 exit_intr_info, unsigned long exit_qualification)
3853{
3854 struct vcpu_vmx *vmx = to_vmx(vcpu);
3855 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3856
3857 /* trying to cancel vmlaunch/vmresume is a bug */
3858 WARN_ON_ONCE(vmx->nested.nested_run_pending);
3859
3860 leave_guest_mode(vcpu);
3861
3862 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
3863 vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
3864
3865 if (likely(!vmx->fail)) {
3866 if (exit_reason == -1)
3867 sync_vmcs12(vcpu, vmcs12);
3868 else
3869 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
3870 exit_qualification);
3871
3872 /*
 3873		 * This must happen outside of sync_vmcs12(), because
 3874		 * sync_vmcs12() is also used to capture the vmcs12 cache as
 3875		 * part of capturing nVMX state for snapshot (migration).
 3876		 *
 3877		 * Otherwise, this flush would dirty guest memory at a
 3878		 * point where user-space already assumes it to be
 3879		 * immutable.
3880 */
3881 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
3882 } else {
3883 /*
3884 * The only expected VM-instruction error is "VM entry with
3885 * invalid control field(s)." Anything else indicates a
3886 * problem with L0. And we should never get here with a
3887 * VMFail of any type if early consistency checks are enabled.
3888 */
3889 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
3890 VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3891 WARN_ON_ONCE(nested_early_check);
3892 }
3893
3894 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3895
3896 /* Update any VMCS fields that might have changed while L2 ran */
3897 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
3898 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
3899 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
3900
3901 if (kvm_has_tsc_control)
3902 decache_tsc_multiplier(vmx);
3903
3904 if (vmx->nested.change_vmcs01_virtual_apic_mode) {
3905 vmx->nested.change_vmcs01_virtual_apic_mode = false;
3906 vmx_set_virtual_apic_mode(vcpu);
3907 } else if (!nested_cpu_has_ept(vmcs12) &&
3908 nested_cpu_has2(vmcs12,
3909 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3910 vmx_flush_tlb(vcpu, true);
3911 }
3912
3913 /* This is needed for same reason as it was needed in prepare_vmcs02 */
3914 vmx->host_rsp = 0;
3915
3916 /* Unpin physical memory we referred to in vmcs02 */
3917 if (vmx->nested.apic_access_page) {
3918 kvm_release_page_dirty(vmx->nested.apic_access_page);
3919 vmx->nested.apic_access_page = NULL;
3920 }
3921 if (vmx->nested.virtual_apic_page) {
3922 kvm_release_page_dirty(vmx->nested.virtual_apic_page);
3923 vmx->nested.virtual_apic_page = NULL;
3924 }
3925 if (vmx->nested.pi_desc_page) {
3926 kunmap(vmx->nested.pi_desc_page);
3927 kvm_release_page_dirty(vmx->nested.pi_desc_page);
3928 vmx->nested.pi_desc_page = NULL;
3929 vmx->nested.pi_desc = NULL;
3930 }
3931
3932 /*
 3933	 * While L2 was running, the mmu_notifier may have forced a reload of
 3934	 * the page's hpa for the L2 vmcs; reload it for L1 before entering L1.
3935 */
3936 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
3937
3938 if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs))
3939 vmx->nested.need_vmcs12_sync = true;
3940
3941 /* in case we halted in L2 */
3942 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3943
3944 if (likely(!vmx->fail)) {
3945 /*
3946 * TODO: SDM says that with acknowledge interrupt on
3947 * exit, bit 31 of the VM-exit interrupt information
3948 * (valid interrupt) is always set to 1 on
3949 * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't
3950 * need kvm_cpu_has_interrupt(). See the commit
3951 * message for details.
3952 */
3953 if (nested_exit_intr_ack_set(vcpu) &&
3954 exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
3955 kvm_cpu_has_interrupt(vcpu)) {
3956 int irq = kvm_cpu_get_interrupt(vcpu);
3957 WARN_ON(irq < 0);
3958 vmcs12->vm_exit_intr_info = irq |
3959 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
3960 }
3961
3962 if (exit_reason != -1)
3963 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
3964 vmcs12->exit_qualification,
3965 vmcs12->idt_vectoring_info_field,
3966 vmcs12->vm_exit_intr_info,
3967 vmcs12->vm_exit_intr_error_code,
3968 KVM_ISA_VMX);
3969
3970 load_vmcs12_host_state(vcpu, vmcs12);
3971
3972 return;
3973 }
3974
3975 /*
3976 * After an early L2 VM-entry failure, we're now back
3977 * in L1 which thinks it just finished a VMLAUNCH or
3978 * VMRESUME instruction, so we need to set the failure
3979 * flag and the VM-instruction error field of the VMCS
3980 * accordingly, and skip the emulated instruction.
3981 */
3982 (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3983
3984 /*
3985 * Restore L1's host state to KVM's software model. We're here
3986 * because a consistency check was caught by hardware, which
3987 * means some amount of guest state has been propagated to KVM's
3988 * model and needs to be unwound to the host's state.
3989 */
3990 nested_vmx_restore_host_state(vcpu);
3991
3992 vmx->fail = 0;
3993}
3994
3995/*
3996 * Decode the memory-address operand of a vmx instruction, as recorded on an
3997 * exit caused by such an instruction (run by a guest hypervisor).
 3998 * On success, returns 0. When the operand is invalid, returns 1 and queues
 3999 * a #UD or #GP exception.
4000 */
4001int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
4002 u32 vmx_instruction_info, bool wr, gva_t *ret)
4003{
4004 gva_t off;
4005 bool exn;
4006 struct kvm_segment s;
4007
4008 /*
4009 * According to Vol. 3B, "Information for VM Exits Due to Instruction
4010 * Execution", on an exit, vmx_instruction_info holds most of the
4011 * addressing components of the operand. Only the displacement part
4012 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
4013 * For how an actual address is calculated from all these components,
4014 * refer to Vol. 1, "Operand Addressing".
4015 */
4016 int scaling = vmx_instruction_info & 3;
4017 int addr_size = (vmx_instruction_info >> 7) & 7;
4018 bool is_reg = vmx_instruction_info & (1u << 10);
4019 int seg_reg = (vmx_instruction_info >> 15) & 7;
4020 int index_reg = (vmx_instruction_info >> 18) & 0xf;
4021 bool index_is_valid = !(vmx_instruction_info & (1u << 22));
4022 int base_reg = (vmx_instruction_info >> 23) & 0xf;
4023 bool base_is_valid = !(vmx_instruction_info & (1u << 27));
4024
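	/*
	 * Illustration of the encoding above (a hypothetical operand, not
	 * taken from this file): for a memory operand such as
	 * 0x10(%rax,%rsi,4) addressed through DS, scaling decodes to 2
	 * (scale by 4), seg_reg to 3 (DS), base_reg to RAX (valid) and
	 * index_reg to RSI (valid), while the displacement 0x10 arrives in
	 * exit_qualification, so the code below computes
	 * addr = DS.base + RAX + (RSI << 2) + 0x10.
	 */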
4025 if (is_reg) {
4026 kvm_queue_exception(vcpu, UD_VECTOR);
4027 return 1;
4028 }
4029
4030 /* Addr = segment_base + offset */
4031 /* offset = base + [index * scale] + displacement */
4032 off = exit_qualification; /* holds the displacement */
4033 if (base_is_valid)
4034 off += kvm_register_read(vcpu, base_reg);
4035 if (index_is_valid)
 4036		off += kvm_register_read(vcpu, index_reg) << scaling;
4037 vmx_get_segment(vcpu, &s, seg_reg);
4038 *ret = s.base + off;
4039
4040 if (addr_size == 1) /* 32 bit */
4041 *ret &= 0xffffffff;
4042
4043 /* Checks for #GP/#SS exceptions. */
4044 exn = false;
4045 if (is_long_mode(vcpu)) {
4046 /* Long mode: #GP(0)/#SS(0) if the memory address is in a
4047 * non-canonical form. This is the only check on the memory
4048 * destination for long mode!
4049 */
4050 exn = is_noncanonical_address(*ret, vcpu);
4051 } else if (is_protmode(vcpu)) {
4052 /* Protected mode: apply checks for segment validity in the
4053 * following order:
4054 * - segment type check (#GP(0) may be thrown)
4055 * - usability check (#GP(0)/#SS(0))
4056 * - limit check (#GP(0)/#SS(0))
4057 */
4058 if (wr)
4059 /* #GP(0) if the destination operand is located in a
4060 * read-only data segment or any code segment.
4061 */
4062 exn = ((s.type & 0xa) == 0 || (s.type & 8));
4063 else
4064 /* #GP(0) if the source operand is located in an
4065 * execute-only code segment
4066 */
4067 exn = ((s.type & 0xa) == 8);
4068 if (exn) {
4069 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
4070 return 1;
4071 }
4072 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
4073 */
4074 exn = (s.unusable != 0);
4075 /* Protected mode: #GP(0)/#SS(0) if the memory
4076 * operand is outside the segment limit.
4077 */
4078 exn = exn || (off + sizeof(u64) > s.limit);
4079 }
4080 if (exn) {
4081 kvm_queue_exception_e(vcpu,
4082 seg_reg == VCPU_SREG_SS ?
4083 SS_VECTOR : GP_VECTOR,
4084 0);
4085 return 1;
4086 }
4087
4088 return 0;
4089}
4090
4091static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
4092{
4093 gva_t gva;
4094 struct x86_exception e;
4095
4096 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
4097 vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
4098 return 1;
4099
4100 if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
4101 kvm_inject_page_fault(vcpu, &e);
4102 return 1;
4103 }
4104
4105 return 0;
4106}
4107
4108/*
4109 * Allocate a shadow VMCS and associate it with the currently loaded
4110 * VMCS, unless such a shadow VMCS already exists. The newly allocated
4111 * VMCS is also VMCLEARed, so that it is ready for use.
4112 */
4113static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
4114{
4115 struct vcpu_vmx *vmx = to_vmx(vcpu);
4116 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
4117
4118 /*
4119 * We should allocate a shadow vmcs for vmcs01 only when L1
4120 * executes VMXON and free it when L1 executes VMXOFF.
4121 * As it is invalid to execute VMXON twice, we shouldn't reach
 4122	 * here when vmcs01 already has an allocated shadow vmcs.
4123 */
4124 WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
4125
4126 if (!loaded_vmcs->shadow_vmcs) {
4127 loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
4128 if (loaded_vmcs->shadow_vmcs)
4129 vmcs_clear(loaded_vmcs->shadow_vmcs);
4130 }
4131 return loaded_vmcs->shadow_vmcs;
4132}
4133
4134static int enter_vmx_operation(struct kvm_vcpu *vcpu)
4135{
4136 struct vcpu_vmx *vmx = to_vmx(vcpu);
4137 int r;
4138
4139 r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
4140 if (r < 0)
4141 goto out_vmcs02;
4142
4143 vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
4144 if (!vmx->nested.cached_vmcs12)
4145 goto out_cached_vmcs12;
4146
4147 vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
4148 if (!vmx->nested.cached_shadow_vmcs12)
4149 goto out_cached_shadow_vmcs12;
4150
4151 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
4152 goto out_shadow_vmcs;
4153
4154 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
4155 HRTIMER_MODE_REL_PINNED);
4156 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
4157
4158 vmx->nested.vpid02 = allocate_vpid();
4159
4160 vmx->nested.vmcs02_initialized = false;
4161 vmx->nested.vmxon = true;
4162
4163 if (pt_mode == PT_MODE_HOST_GUEST) {
4164 vmx->pt_desc.guest.ctl = 0;
4165 pt_update_intercept_for_msr(vmx);
4166 }
4167
4168 return 0;
4169
4170out_shadow_vmcs:
4171 kfree(vmx->nested.cached_shadow_vmcs12);
4172
4173out_cached_shadow_vmcs12:
4174 kfree(vmx->nested.cached_vmcs12);
4175
4176out_cached_vmcs12:
4177 free_loaded_vmcs(&vmx->nested.vmcs02);
4178
4179out_vmcs02:
4180 return -ENOMEM;
4181}
4182
4183/*
4184 * Emulate the VMXON instruction.
4185 * Currently, we just remember that VMX is active, and do not save or even
4186 * inspect the argument to VMXON (the so-called "VMXON pointer") because we
4187 * do not currently need to store anything in that guest-allocated memory
 4188 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their
4189 * argument is different from the VMXON pointer (which the spec says they do).
4190 */
4191static int handle_vmon(struct kvm_vcpu *vcpu)
4192{
4193 int ret;
4194 gpa_t vmptr;
4195 struct page *page;
4196 struct vcpu_vmx *vmx = to_vmx(vcpu);
4197 const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
4198 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
4199
4200 /*
4201 * The Intel VMX Instruction Reference lists a bunch of bits that are
4202 * prerequisite to running VMXON, most notably cr4.VMXE must be set to
4203 * 1 (see vmx_set_cr4() for when we allow the guest to set this).
4204 * Otherwise, we should fail with #UD. But most faulting conditions
4205 * have already been checked by hardware, prior to the VM-exit for
4206 * VMXON. We do test guest cr4.VMXE because processor CR4 always has
4207 * that bit set to 1 in non-root mode.
4208 */
4209 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
4210 kvm_queue_exception(vcpu, UD_VECTOR);
4211 return 1;
4212 }
4213
4214 /* CPL=0 must be checked manually. */
4215 if (vmx_get_cpl(vcpu)) {
4216 kvm_inject_gp(vcpu, 0);
4217 return 1;
4218 }
4219
4220 if (vmx->nested.vmxon)
4221 return nested_vmx_failValid(vcpu,
4222 VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
4223
4224 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
4225 != VMXON_NEEDED_FEATURES) {
4226 kvm_inject_gp(vcpu, 0);
4227 return 1;
4228 }
4229
4230 if (nested_vmx_get_vmptr(vcpu, &vmptr))
4231 return 1;
4232
4233 /*
4234 * SDM 3: 24.11.5
 4235	 * The first 4 bytes of the VMXON region contain the supported
 4236	 * VMCS revision identifier.
 4237	 *
 4238	 * Note: IA32_VMX_BASIC[48] will never be 1 for the nested case;
 4239	 * that bit would limit the physical address width to 32 bits.
4240 */
4241 if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
4242 return nested_vmx_failInvalid(vcpu);
4243
4244 page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
4245 if (is_error_page(page))
4246 return nested_vmx_failInvalid(vcpu);
4247
4248 if (*(u32 *)kmap(page) != VMCS12_REVISION) {
4249 kunmap(page);
4250 kvm_release_page_clean(page);
4251 return nested_vmx_failInvalid(vcpu);
4252 }
4253 kunmap(page);
4254 kvm_release_page_clean(page);
4255
4256 vmx->nested.vmxon_ptr = vmptr;
4257 ret = enter_vmx_operation(vcpu);
4258 if (ret)
4259 return ret;
4260
4261 return nested_vmx_succeed(vcpu);
4262}
4263
4264static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
4265{
4266 struct vcpu_vmx *vmx = to_vmx(vcpu);
4267
4268 if (vmx->nested.current_vmptr == -1ull)
4269 return;
4270
4271 if (enable_shadow_vmcs) {
4272 /* copy to memory all shadowed fields in case
4273 they were modified */
4274 copy_shadow_to_vmcs12(vmx);
4275 vmx->nested.need_vmcs12_sync = false;
4276 vmx_disable_shadow_vmcs(vmx);
4277 }
4278 vmx->nested.posted_intr_nv = -1;
4279
4280 /* Flush VMCS12 to guest memory */
4281 kvm_vcpu_write_guest_page(vcpu,
4282 vmx->nested.current_vmptr >> PAGE_SHIFT,
4283 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
4284
4285 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
4286
4287 vmx->nested.current_vmptr = -1ull;
4288}
4289
4290/* Emulate the VMXOFF instruction */
4291static int handle_vmoff(struct kvm_vcpu *vcpu)
4292{
4293 if (!nested_vmx_check_permission(vcpu))
4294 return 1;
4295 free_nested(vcpu);
4296 return nested_vmx_succeed(vcpu);
4297}
4298
4299/* Emulate the VMCLEAR instruction */
4300static int handle_vmclear(struct kvm_vcpu *vcpu)
4301{
4302 struct vcpu_vmx *vmx = to_vmx(vcpu);
4303 u32 zero = 0;
4304 gpa_t vmptr;
4305
4306 if (!nested_vmx_check_permission(vcpu))
4307 return 1;
4308
4309 if (nested_vmx_get_vmptr(vcpu, &vmptr))
4310 return 1;
4311
4312 if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
4313 return nested_vmx_failValid(vcpu,
4314 VMXERR_VMCLEAR_INVALID_ADDRESS);
4315
4316 if (vmptr == vmx->nested.vmxon_ptr)
4317 return nested_vmx_failValid(vcpu,
4318 VMXERR_VMCLEAR_VMXON_POINTER);
4319
4320 if (vmx->nested.hv_evmcs_page) {
4321 if (vmptr == vmx->nested.hv_evmcs_vmptr)
4322 nested_release_evmcs(vcpu);
4323 } else {
4324 if (vmptr == vmx->nested.current_vmptr)
4325 nested_release_vmcs12(vcpu);
4326
4327 kvm_vcpu_write_guest(vcpu,
4328 vmptr + offsetof(struct vmcs12,
4329 launch_state),
4330 &zero, sizeof(zero));
4331 }
4332
4333 return nested_vmx_succeed(vcpu);
4334}
4335
4336static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
4337
4338/* Emulate the VMLAUNCH instruction */
4339static int handle_vmlaunch(struct kvm_vcpu *vcpu)
4340{
4341 return nested_vmx_run(vcpu, true);
4342}
4343
4344/* Emulate the VMRESUME instruction */
4345static int handle_vmresume(struct kvm_vcpu *vcpu)
4346{
4347
4348 return nested_vmx_run(vcpu, false);
4349}
4350
4351static int handle_vmread(struct kvm_vcpu *vcpu)
4352{
4353 unsigned long field;
4354 u64 field_value;
4355 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4356 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4357 gva_t gva = 0;
4358 struct vmcs12 *vmcs12;
4359
4360 if (!nested_vmx_check_permission(vcpu))
4361 return 1;
4362
4363 if (to_vmx(vcpu)->nested.current_vmptr == -1ull)
4364 return nested_vmx_failInvalid(vcpu);
4365
4366 if (!is_guest_mode(vcpu))
4367 vmcs12 = get_vmcs12(vcpu);
4368 else {
4369 /*
4370 * When vmcs->vmcs_link_pointer is -1ull, any VMREAD
 4371		 * from a shadowed field sets the ALU flags for VMfailInvalid.
4372 */
4373 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
4374 return nested_vmx_failInvalid(vcpu);
4375 vmcs12 = get_shadow_vmcs12(vcpu);
4376 }
4377
4378 /* Decode instruction info and find the field to read */
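	/*
	 * In the VM-exit instruction information, bits 31:28 name the
	 * register holding the VMCS field encoding; bit 10 says whether the
	 * other operand is a register (named by bits 6:3) or memory, which
	 * is how the destination is chosen further below.
	 */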
4379 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
4380 /* Read the field, zero-extended to a u64 field_value */
4381 if (vmcs12_read_any(vmcs12, field, &field_value) < 0)
4382 return nested_vmx_failValid(vcpu,
4383 VMXERR_UNSUPPORTED_VMCS_COMPONENT);
4384
4385 /*
4386 * Now copy part of this value to register or memory, as requested.
4387 * Note that the number of bits actually copied is 32 or 64 depending
4388 * on the guest's mode (32 or 64 bit), not on the given field's length.
4389 */
4390 if (vmx_instruction_info & (1u << 10)) {
4391 kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
4392 field_value);
4393 } else {
4394 if (get_vmx_mem_address(vcpu, exit_qualification,
4395 vmx_instruction_info, true, &gva))
4396 return 1;
4397 /* _system ok, nested_vmx_check_permission has verified cpl=0 */
4398 kvm_write_guest_virt_system(vcpu, gva, &field_value,
4399 (is_long_mode(vcpu) ? 8 : 4), NULL);
4400 }
4401
4402 return nested_vmx_succeed(vcpu);
4403}
4404
4405
4406static int handle_vmwrite(struct kvm_vcpu *vcpu)
4407{
4408 unsigned long field;
4409 gva_t gva;
4410 struct vcpu_vmx *vmx = to_vmx(vcpu);
4411 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4412 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4413
4414 /* The value to write might be 32 or 64 bits, depending on L1's long
4415 * mode, and eventually we need to write that into a field of several
4416 * possible lengths. The code below first zero-extends the value to 64
4417 * bit (field_value), and then copies only the appropriate number of
4418 * bits into the vmcs12 field.
4419 */
4420 u64 field_value = 0;
4421 struct x86_exception e;
4422 struct vmcs12 *vmcs12;
4423
4424 if (!nested_vmx_check_permission(vcpu))
4425 return 1;
4426
4427 if (vmx->nested.current_vmptr == -1ull)
4428 return nested_vmx_failInvalid(vcpu);
4429
4430 if (vmx_instruction_info & (1u << 10))
4431 field_value = kvm_register_readl(vcpu,
4432 (((vmx_instruction_info) >> 3) & 0xf));
4433 else {
4434 if (get_vmx_mem_address(vcpu, exit_qualification,
4435 vmx_instruction_info, false, &gva))
4436 return 1;
4437 if (kvm_read_guest_virt(vcpu, gva, &field_value,
4438 (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
4439 kvm_inject_page_fault(vcpu, &e);
4440 return 1;
4441 }
4442 }
4443
4444
4445 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
4446 /*
4447 * If the vCPU supports "VMWRITE to any supported field in the
4448 * VMCS," then the "read-only" fields are actually read/write.
4449 */
4450 if (vmcs_field_readonly(field) &&
4451 !nested_cpu_has_vmwrite_any_field(vcpu))
4452 return nested_vmx_failValid(vcpu,
4453 VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
4454
4455 if (!is_guest_mode(vcpu))
4456 vmcs12 = get_vmcs12(vcpu);
4457 else {
4458 /*
4459 * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
 4460		 * to a shadowed field sets the ALU flags for VMfailInvalid.
4461 */
4462 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
4463 return nested_vmx_failInvalid(vcpu);
4464 vmcs12 = get_shadow_vmcs12(vcpu);
4465 }
4466
4467 if (vmcs12_write_any(vmcs12, field, field_value) < 0)
4468 return nested_vmx_failValid(vcpu,
4469 VMXERR_UNSUPPORTED_VMCS_COMPONENT);
4470
4471 /*
 4472	 * Do not track vmcs12 dirty state if in guest mode,
 4473	 * as we actually dirty the shadow vmcs12 instead of vmcs12.
4474 */
4475 if (!is_guest_mode(vcpu)) {
4476 switch (field) {
4477#define SHADOW_FIELD_RW(x) case x:
4478#include "vmcs_shadow_fields.h"
4479 /*
4480 * The fields that can be updated by L1 without a vmexit are
 4481		 * always updated in the vmcs02; the others go down the slow
4482 * path of prepare_vmcs02.
4483 */
4484 break;
4485 default:
4486 vmx->nested.dirty_vmcs12 = true;
4487 break;
4488 }
4489 }
4490
4491 return nested_vmx_succeed(vcpu);
4492}
4493
4494static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
4495{
4496 vmx->nested.current_vmptr = vmptr;
4497 if (enable_shadow_vmcs) {
4498 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
4499 SECONDARY_EXEC_SHADOW_VMCS);
4500 vmcs_write64(VMCS_LINK_POINTER,
4501 __pa(vmx->vmcs01.shadow_vmcs));
4502 vmx->nested.need_vmcs12_sync = true;
4503 }
4504 vmx->nested.dirty_vmcs12 = true;
4505}
4506
4507/* Emulate the VMPTRLD instruction */
4508static int handle_vmptrld(struct kvm_vcpu *vcpu)
4509{
4510 struct vcpu_vmx *vmx = to_vmx(vcpu);
4511 gpa_t vmptr;
4512
4513 if (!nested_vmx_check_permission(vcpu))
4514 return 1;
4515
4516 if (nested_vmx_get_vmptr(vcpu, &vmptr))
4517 return 1;
4518
4519 if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
4520 return nested_vmx_failValid(vcpu,
4521 VMXERR_VMPTRLD_INVALID_ADDRESS);
4522
4523 if (vmptr == vmx->nested.vmxon_ptr)
4524 return nested_vmx_failValid(vcpu,
4525 VMXERR_VMPTRLD_VMXON_POINTER);
4526
4527 /* Forbid normal VMPTRLD if Enlightened version was used */
4528 if (vmx->nested.hv_evmcs)
4529 return 1;
4530
4531 if (vmx->nested.current_vmptr != vmptr) {
4532 struct vmcs12 *new_vmcs12;
4533 struct page *page;
4534
4535 page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
4536 if (is_error_page(page)) {
4537 /*
4538 * Reads from an unbacked page return all 1s,
4539 * which means that the 32 bits located at the
4540 * given physical address won't match the required
4541 * VMCS12_REVISION identifier.
4542 */
4543 nested_vmx_failValid(vcpu,
4544 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
4545 return kvm_skip_emulated_instruction(vcpu);
4546 }
4547 new_vmcs12 = kmap(page);
4548 if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
4549 (new_vmcs12->hdr.shadow_vmcs &&
4550 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
4551 kunmap(page);
4552 kvm_release_page_clean(page);
4553 return nested_vmx_failValid(vcpu,
4554 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
4555 }
4556
4557 nested_release_vmcs12(vcpu);
4558
4559 /*
4560 * Load VMCS12 from guest memory since it is not already
4561 * cached.
4562 */
4563 memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
4564 kunmap(page);
4565 kvm_release_page_clean(page);
4566
4567 set_current_vmptr(vmx, vmptr);
4568 }
4569
4570 return nested_vmx_succeed(vcpu);
4571}
4572
4573/* Emulate the VMPTRST instruction */
4574static int handle_vmptrst(struct kvm_vcpu *vcpu)
4575{
4576 unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION);
4577 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4578 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
4579 struct x86_exception e;
4580 gva_t gva;
4581
4582 if (!nested_vmx_check_permission(vcpu))
4583 return 1;
4584
4585 if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
4586 return 1;
4587
4588 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva))
4589 return 1;
4590 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
4591 if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
4592 sizeof(gpa_t), &e)) {
4593 kvm_inject_page_fault(vcpu, &e);
4594 return 1;
4595 }
4596 return nested_vmx_succeed(vcpu);
4597}
4598
4599/* Emulate the INVEPT instruction */
4600static int handle_invept(struct kvm_vcpu *vcpu)
4601{
4602 struct vcpu_vmx *vmx = to_vmx(vcpu);
4603 u32 vmx_instruction_info, types;
4604 unsigned long type;
4605 gva_t gva;
4606 struct x86_exception e;
4607 struct {
4608 u64 eptp, gpa;
4609 } operand;
4610
4611 if (!(vmx->nested.msrs.secondary_ctls_high &
4612 SECONDARY_EXEC_ENABLE_EPT) ||
4613 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
4614 kvm_queue_exception(vcpu, UD_VECTOR);
4615 return 1;
4616 }
4617
4618 if (!nested_vmx_check_permission(vcpu))
4619 return 1;
4620
4621 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4622 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
4623
4624 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
4625
4626 if (type >= 32 || !(types & (1 << type)))
4627 return nested_vmx_failValid(vcpu,
4628 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
4629
4630 /* According to the Intel VMX instruction reference, the memory
4631 * operand is read even if it isn't needed (e.g., for type==global)
4632 */
4633 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
4634 vmx_instruction_info, false, &gva))
4635 return 1;
4636 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
4637 kvm_inject_page_fault(vcpu, &e);
4638 return 1;
4639 }
4640
4641 switch (type) {
4642 case VMX_EPT_EXTENT_GLOBAL:
4643 /*
4644 * TODO: track mappings and invalidate
4645 * single context requests appropriately
4646 */
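		/*
		 * Fall through: global and single-context invalidation are
		 * currently handled identically, see the TODO above.
		 */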
4647 case VMX_EPT_EXTENT_CONTEXT:
4648 kvm_mmu_sync_roots(vcpu);
4649 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
4650 break;
4651 default:
4652 BUG_ON(1);
4653 break;
4654 }
4655
4656 return nested_vmx_succeed(vcpu);
4657}
4658
4659static int handle_invvpid(struct kvm_vcpu *vcpu)
4660{
4661 struct vcpu_vmx *vmx = to_vmx(vcpu);
4662 u32 vmx_instruction_info;
4663 unsigned long type, types;
4664 gva_t gva;
4665 struct x86_exception e;
4666 struct {
4667 u64 vpid;
4668 u64 gla;
4669 } operand;
4670 u16 vpid02;
4671
4672 if (!(vmx->nested.msrs.secondary_ctls_high &
4673 SECONDARY_EXEC_ENABLE_VPID) ||
4674 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
4675 kvm_queue_exception(vcpu, UD_VECTOR);
4676 return 1;
4677 }
4678
4679 if (!nested_vmx_check_permission(vcpu))
4680 return 1;
4681
4682 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4683 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
4684
4685 types = (vmx->nested.msrs.vpid_caps &
4686 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
4687
4688 if (type >= 32 || !(types & (1 << type)))
4689 return nested_vmx_failValid(vcpu,
4690 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
4691
 4692	/* According to the Intel VMX instruction reference, the memory
 4693	 * operand is read even if it isn't needed (e.g., for type==global).
4694 */
4695 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
4696 vmx_instruction_info, false, &gva))
4697 return 1;
4698 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
4699 kvm_inject_page_fault(vcpu, &e);
4700 return 1;
4701 }
4702 if (operand.vpid >> 16)
4703 return nested_vmx_failValid(vcpu,
4704 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
4705
4706 vpid02 = nested_get_vpid02(vcpu);
4707 switch (type) {
4708 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
4709 if (!operand.vpid ||
4710 is_noncanonical_address(operand.gla, vcpu))
4711 return nested_vmx_failValid(vcpu,
4712 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
4713 if (cpu_has_vmx_invvpid_individual_addr()) {
4714 __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
4715 vpid02, operand.gla);
4716 } else
4717 __vmx_flush_tlb(vcpu, vpid02, false);
4718 break;
4719 case VMX_VPID_EXTENT_SINGLE_CONTEXT:
4720 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
4721 if (!operand.vpid)
4722 return nested_vmx_failValid(vcpu,
4723 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
4724 __vmx_flush_tlb(vcpu, vpid02, false);
4725 break;
4726 case VMX_VPID_EXTENT_ALL_CONTEXT:
4727 __vmx_flush_tlb(vcpu, vpid02, false);
4728 break;
4729 default:
4730 WARN_ON_ONCE(1);
4731 return kvm_skip_emulated_instruction(vcpu);
4732 }
4733
4734 return nested_vmx_succeed(vcpu);
4735}
4736
4737static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
4738 struct vmcs12 *vmcs12)
4739{
4740 u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
4741 u64 address;
4742 bool accessed_dirty;
4743 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
4744
4745 if (!nested_cpu_has_eptp_switching(vmcs12) ||
4746 !nested_cpu_has_ept(vmcs12))
4747 return 1;
4748
4749 if (index >= VMFUNC_EPTP_ENTRIES)
4750 return 1;
4751
4752
4753 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
4754 &address, index * 8, 8))
4755 return 1;
4756
4757 accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT);
4758
4759 /*
4760 * If the (L2) guest does a vmfunc to the currently
4761 * active ept pointer, we don't have to do anything else
4762 */
4763 if (vmcs12->ept_pointer != address) {
4764 if (!valid_ept_address(vcpu, address))
4765 return 1;
4766
4767 kvm_mmu_unload(vcpu);
4768 mmu->ept_ad = accessed_dirty;
4769 mmu->mmu_role.base.ad_disabled = !accessed_dirty;
4770 vmcs12->ept_pointer = address;
4771 /*
4772 * TODO: Check what's the correct approach in case
4773 * mmu reload fails. Currently, we just let the next
4774 * reload potentially fail
4775 */
4776 kvm_mmu_reload(vcpu);
4777 }
4778
4779 return 0;
4780}
4781
4782static int handle_vmfunc(struct kvm_vcpu *vcpu)
4783{
4784 struct vcpu_vmx *vmx = to_vmx(vcpu);
4785 struct vmcs12 *vmcs12;
4786 u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
4787
4788 /*
4789 * VMFUNC is only supported for nested guests, but we always enable the
4790 * secondary control for simplicity; for non-nested mode, fake that we
4791 * didn't by injecting #UD.
4792 */
4793 if (!is_guest_mode(vcpu)) {
4794 kvm_queue_exception(vcpu, UD_VECTOR);
4795 return 1;
4796 }
4797
4798 vmcs12 = get_vmcs12(vcpu);
4799 if ((vmcs12->vm_function_control & (1 << function)) == 0)
4800 goto fail;
4801
4802 switch (function) {
4803 case 0:
4804 if (nested_vmx_eptp_switching(vcpu, vmcs12))
4805 goto fail;
4806 break;
4807 default:
4808 goto fail;
4809 }
4810 return kvm_skip_emulated_instruction(vcpu);
4811
4812fail:
4813 nested_vmx_vmexit(vcpu, vmx->exit_reason,
4814 vmcs_read32(VM_EXIT_INTR_INFO),
4815 vmcs_readl(EXIT_QUALIFICATION));
4816 return 1;
4817}
4818
4819
4820static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
4821 struct vmcs12 *vmcs12)
4822{
4823 unsigned long exit_qualification;
4824 gpa_t bitmap, last_bitmap;
4825 unsigned int port;
4826 int size;
4827 u8 b;
4828
4829 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
4830 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
4831
4832 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4833
4834 port = exit_qualification >> 16;
4835 size = (exit_qualification & 7) + 1;
4836
4837 last_bitmap = (gpa_t)-1;
4838 b = -1;
4839
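	/*
	 * Illustration (hypothetical access, not from this file): for a
	 * one-byte access to port 0x3f8, port < 0x8000 selects io_bitmap_a,
	 * the byte at offset 0x3f8 / 8 = 0x7f is read from guest memory and
	 * bit 0x3f8 & 7 = 0 of it decides whether L1 wants the exit.  An
	 * access that straddles port 0x7fff consults both bitmaps, one port
	 * at a time.
	 */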
4840 while (size > 0) {
4841 if (port < 0x8000)
4842 bitmap = vmcs12->io_bitmap_a;
4843 else if (port < 0x10000)
4844 bitmap = vmcs12->io_bitmap_b;
4845 else
4846 return true;
4847 bitmap += (port & 0x7fff) / 8;
4848
4849 if (last_bitmap != bitmap)
4850 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
4851 return true;
4852 if (b & (1 << (port & 7)))
4853 return true;
4854
4855 port++;
4856 size--;
4857 last_bitmap = bitmap;
4858 }
4859
4860 return false;
4861}
4862
4863/*
 4864 * Return true if we should exit from L2 to L1 to handle an MSR access,
4865 * rather than handle it ourselves in L0. I.e., check whether L1 expressed
4866 * disinterest in the current event (read or write a specific MSR) by using an
4867 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
4868 */
4869static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
4870 struct vmcs12 *vmcs12, u32 exit_reason)
4871{
4872 u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
4873 gpa_t bitmap;
4874
4875 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
4876 return true;
4877
4878 /*
4879 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
4880 * for the four combinations of read/write and low/high MSR numbers.
4881 * First we need to figure out which of the four to use:
4882 */
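	/*
	 * Illustration (hypothetical MSR, not from this file): for a WRMSR
	 * to MSR_LSTAR (0xc0000082), the write half adds 2048, the high MSR
	 * range adds another 1024 and leaves msr_index = 0x82, so the byte
	 * at offset 2048 + 1024 + 0x82 / 8 = 3088 of the bitmap is read and
	 * bit 0x82 & 7 = 2 of it decides whether L1 wants the exit.
	 */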
4883 bitmap = vmcs12->msr_bitmap;
4884 if (exit_reason == EXIT_REASON_MSR_WRITE)
4885 bitmap += 2048;
4886 if (msr_index >= 0xc0000000) {
4887 msr_index -= 0xc0000000;
4888 bitmap += 1024;
4889 }
4890
4891 /* Then read the msr_index'th bit from this bitmap: */
4892 if (msr_index < 1024*8) {
4893 unsigned char b;
4894 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
4895 return true;
4896 return 1 & (b >> (msr_index & 7));
4897 } else
4898 return true; /* let L1 handle the wrong parameter */
4899}
4900
4901/*
 4902 * Return true if we should exit from L2 to L1 to handle a CR access exit,
4903 * rather than handle it ourselves in L0. I.e., check if L1 wanted to
4904 * intercept (via guest_host_mask etc.) the current event.
4905 */
4906static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
4907 struct vmcs12 *vmcs12)
4908{
4909 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4910 int cr = exit_qualification & 15;
4911 int reg;
4912 unsigned long val;
4913
4914 switch ((exit_qualification >> 4) & 3) {
4915 case 0: /* mov to cr */
4916 reg = (exit_qualification >> 8) & 15;
4917 val = kvm_register_readl(vcpu, reg);
4918 switch (cr) {
4919 case 0:
4920 if (vmcs12->cr0_guest_host_mask &
4921 (val ^ vmcs12->cr0_read_shadow))
4922 return true;
4923 break;
4924 case 3:
4925 if ((vmcs12->cr3_target_count >= 1 &&
4926 vmcs12->cr3_target_value0 == val) ||
4927 (vmcs12->cr3_target_count >= 2 &&
4928 vmcs12->cr3_target_value1 == val) ||
4929 (vmcs12->cr3_target_count >= 3 &&
4930 vmcs12->cr3_target_value2 == val) ||
4931 (vmcs12->cr3_target_count >= 4 &&
4932 vmcs12->cr3_target_value3 == val))
4933 return false;
4934 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
4935 return true;
4936 break;
4937 case 4:
4938 if (vmcs12->cr4_guest_host_mask &
4939 (vmcs12->cr4_read_shadow ^ val))
4940 return true;
4941 break;
4942 case 8:
4943 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
4944 return true;
4945 break;
4946 }
4947 break;
4948 case 2: /* clts */
4949 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
4950 (vmcs12->cr0_read_shadow & X86_CR0_TS))
4951 return true;
4952 break;
4953 case 1: /* mov from cr */
4954 switch (cr) {
4955 case 3:
4956 if (vmcs12->cpu_based_vm_exec_control &
4957 CPU_BASED_CR3_STORE_EXITING)
4958 return true;
4959 break;
4960 case 8:
4961 if (vmcs12->cpu_based_vm_exec_control &
4962 CPU_BASED_CR8_STORE_EXITING)
4963 return true;
4964 break;
4965 }
4966 break;
4967 case 3: /* lmsw */
4968 /*
4969 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
4970 * cr0. Other attempted changes are ignored, with no exit.
4971 */
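		/*
		 * E.g. (hypothetical masks): with cr0_guest_host_mask = 0x9
		 * (PE and TS owned by L1), an lmsw source of 0x9 exits to L1
		 * when the read shadow has TS clear (an owned bit in 1..3
		 * would change) or PE clear (lmsw would set an owned bit that
		 * is shadowed as 0).
		 */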
4972 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
4973 if (vmcs12->cr0_guest_host_mask & 0xe &
4974 (val ^ vmcs12->cr0_read_shadow))
4975 return true;
4976 if ((vmcs12->cr0_guest_host_mask & 0x1) &&
4977 !(vmcs12->cr0_read_shadow & 0x1) &&
4978 (val & 0x1))
4979 return true;
4980 break;
4981 }
4982 return false;
4983}
4984
4985static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
4986 struct vmcs12 *vmcs12, gpa_t bitmap)
4987{
4988 u32 vmx_instruction_info;
4989 unsigned long field;
4990 u8 b;
4991
4992 if (!nested_cpu_has_shadow_vmcs(vmcs12))
4993 return true;
4994
4995 /* Decode instruction info and find the field to access */
4996 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4997 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
4998
4999 /* Out-of-range fields always cause a VM exit from L2 to L1 */
5000 if (field >> 15)
5001 return true;
5002
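	/*
	 * Illustration (hypothetical field, not from this file): for an
	 * access to VM_EXIT_REASON (field encoding 0x4402), the byte at
	 * offset 0x4402 / 8 = 0x880 of the bitmap is read and bit
	 * 0x4402 & 7 = 2 of it decides whether the access exits to L1.
	 */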
5003 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
5004 return true;
5005
5006 return 1 & (b >> (field & 7));
5007}
5008
5009/*
 5010 * Return true if we should exit from L2 to L1 to handle an exit, or false
 5011 * if we should handle it ourselves in L0 (and then continue L2). Only call
 5012 * this when in is_guest_mode() (L2).
5013 */
5014bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
5015{
5016 u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
5017 struct vcpu_vmx *vmx = to_vmx(vcpu);
5018 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5019
5020 if (vmx->nested.nested_run_pending)
5021 return false;
5022
5023 if (unlikely(vmx->fail)) {
5024 pr_info_ratelimited("%s failed vm entry %x\n", __func__,
5025 vmcs_read32(VM_INSTRUCTION_ERROR));
5026 return true;
5027 }
5028
5029 /*
5030 * The host physical addresses of some pages of guest memory
5031 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
5032 * Page). The CPU may write to these pages via their host
5033 * physical address while L2 is running, bypassing any
5034 * address-translation-based dirty tracking (e.g. EPT write
5035 * protection).
5036 *
5037 * Mark them dirty on every exit from L2 to prevent them from
5038 * getting out of sync with dirty tracking.
5039 */
5040 nested_mark_vmcs12_pages_dirty(vcpu);
5041
5042 trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
5043 vmcs_readl(EXIT_QUALIFICATION),
5044 vmx->idt_vectoring_info,
5045 intr_info,
5046 vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
5047 KVM_ISA_VMX);
5048
5049 switch (exit_reason) {
5050 case EXIT_REASON_EXCEPTION_NMI:
5051 if (is_nmi(intr_info))
5052 return false;
5053 else if (is_page_fault(intr_info))
5054 return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
5055 else if (is_debug(intr_info) &&
5056 vcpu->guest_debug &
5057 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
5058 return false;
5059 else if (is_breakpoint(intr_info) &&
5060 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
5061 return false;
5062 return vmcs12->exception_bitmap &
5063 (1u << (intr_info & INTR_INFO_VECTOR_MASK));
5064 case EXIT_REASON_EXTERNAL_INTERRUPT:
5065 return false;
5066 case EXIT_REASON_TRIPLE_FAULT:
5067 return true;
5068 case EXIT_REASON_PENDING_INTERRUPT:
5069 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
5070 case EXIT_REASON_NMI_WINDOW:
5071 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
5072 case EXIT_REASON_TASK_SWITCH:
5073 return true;
5074 case EXIT_REASON_CPUID:
5075 return true;
5076 case EXIT_REASON_HLT:
5077 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
5078 case EXIT_REASON_INVD:
5079 return true;
5080 case EXIT_REASON_INVLPG:
5081 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5082 case EXIT_REASON_RDPMC:
5083 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
5084 case EXIT_REASON_RDRAND:
5085 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
5086 case EXIT_REASON_RDSEED:
5087 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
5088 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
5089 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
5090 case EXIT_REASON_VMREAD:
5091 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
5092 vmcs12->vmread_bitmap);
5093 case EXIT_REASON_VMWRITE:
5094 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
5095 vmcs12->vmwrite_bitmap);
5096 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
5097 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
5098 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
5099 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
5100 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
5101 /*
5102 * VMX instructions trap unconditionally. This allows L1 to
5103 * emulate them for its L2 guest, i.e., allows 3-level nesting!
5104 */
5105 return true;
5106 case EXIT_REASON_CR_ACCESS:
5107 return nested_vmx_exit_handled_cr(vcpu, vmcs12);
5108 case EXIT_REASON_DR_ACCESS:
5109 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
5110 case EXIT_REASON_IO_INSTRUCTION:
5111 return nested_vmx_exit_handled_io(vcpu, vmcs12);
5112 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
5113 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
5114 case EXIT_REASON_MSR_READ:
5115 case EXIT_REASON_MSR_WRITE:
5116 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
5117 case EXIT_REASON_INVALID_STATE:
5118 return true;
5119 case EXIT_REASON_MWAIT_INSTRUCTION:
5120 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
5121 case EXIT_REASON_MONITOR_TRAP_FLAG:
5122 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
5123 case EXIT_REASON_MONITOR_INSTRUCTION:
5124 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
5125 case EXIT_REASON_PAUSE_INSTRUCTION:
5126 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
5127 nested_cpu_has2(vmcs12,
5128 SECONDARY_EXEC_PAUSE_LOOP_EXITING);
5129 case EXIT_REASON_MCE_DURING_VMENTRY:
5130 return false;
5131 case EXIT_REASON_TPR_BELOW_THRESHOLD:
5132 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
5133 case EXIT_REASON_APIC_ACCESS:
5134 case EXIT_REASON_APIC_WRITE:
5135 case EXIT_REASON_EOI_INDUCED:
5136 /*
5137 * The controls for "virtualize APIC accesses," "APIC-
5138 * register virtualization," and "virtual-interrupt
5139 * delivery" only come from vmcs12.
5140 */
5141 return true;
5142 case EXIT_REASON_EPT_VIOLATION:
5143 /*
5144 * L0 always deals with the EPT violation. If nested EPT is
5145 * used, and the nested mmu code discovers that the address is
5146 * missing in the guest EPT table (EPT12), the EPT violation
5147 * will be injected with nested_ept_inject_page_fault()
5148 */
5149 return false;
5150 case EXIT_REASON_EPT_MISCONFIG:
5151 /*
5152		 * L2 never directly uses L1's EPT, but rather L0's own EPT
5153		 * table (shadow on EPT) or a merged EPT table that L0 built
5154		 * (EPT on EPT). So any problems with the structure of the
5155		 * table are L0's fault.
5156 */
5157 return false;
5158 case EXIT_REASON_INVPCID:
5159 return
5160 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
5161 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5162 case EXIT_REASON_WBINVD:
5163 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
5164 case EXIT_REASON_XSETBV:
5165 return true;
5166 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
5167 /*
5168 * This should never happen, since it is not possible to
5169 * set XSS to a non-zero value---neither in L1 nor in L2.
5170		 * If it were, XSS would have to be checked against
5171 * the XSS exit bitmap in vmcs12.
5172 */
5173 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
5174 case EXIT_REASON_PREEMPTION_TIMER:
5175 return false;
5176 case EXIT_REASON_PML_FULL:
5177		/* We emulate PML support for L1. */
5178 return false;
5179 case EXIT_REASON_VMFUNC:
5180 /* VM functions are emulated through L2->L0 vmexits. */
5181 return false;
5182 case EXIT_REASON_ENCLS:
5183 /* SGX is never exposed to L1 */
5184 return false;
5185 default:
5186 return true;
5187 }
5188}
5189
5190
5191static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
5192 struct kvm_nested_state __user *user_kvm_nested_state,
5193 u32 user_data_size)
5194{
5195 struct vcpu_vmx *vmx;
5196 struct vmcs12 *vmcs12;
5197 struct kvm_nested_state kvm_state = {
5198 .flags = 0,
5199 .format = 0,
5200 .size = sizeof(kvm_state),
5201 .vmx.vmxon_pa = -1ull,
5202 .vmx.vmcs_pa = -1ull,
5203 };
5204
5205 if (!vcpu)
5206 return kvm_state.size + 2 * VMCS12_SIZE;
5207
5208 vmx = to_vmx(vcpu);
5209 vmcs12 = get_vmcs12(vcpu);
5210
5211 if (nested_vmx_allowed(vcpu) && vmx->nested.enlightened_vmcs_enabled)
5212 kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
5213
5214 if (nested_vmx_allowed(vcpu) &&
5215 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
5216 kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
5217 kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr;
5218
5219 if (vmx_has_valid_vmcs12(vcpu)) {
5220 kvm_state.size += VMCS12_SIZE;
5221
5222 if (is_guest_mode(vcpu) &&
5223 nested_cpu_has_shadow_vmcs(vmcs12) &&
5224 vmcs12->vmcs_link_pointer != -1ull)
5225 kvm_state.size += VMCS12_SIZE;
5226 }
5227
5228 if (vmx->nested.smm.vmxon)
5229 kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
5230
5231 if (vmx->nested.smm.guest_mode)
5232 kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
5233
5234 if (is_guest_mode(vcpu)) {
5235 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
5236
5237 if (vmx->nested.nested_run_pending)
5238 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
5239 }
5240 }
5241
5242 if (user_data_size < kvm_state.size)
5243 goto out;
5244
5245 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
5246 return -EFAULT;
5247
5248 if (!vmx_has_valid_vmcs12(vcpu))
5249 goto out;
5250
5251 /*
5252 * When running L2, the authoritative vmcs12 state is in the
5253 * vmcs02. When running L1, the authoritative vmcs12 state is
5254 * in the shadow or enlightened vmcs linked to vmcs01, unless
5255 * need_vmcs12_sync is set, in which case, the authoritative
5256 * vmcs12 state is in the vmcs12 already.
5257 */
5258 if (is_guest_mode(vcpu)) {
5259 sync_vmcs12(vcpu, vmcs12);
5260 } else if (!vmx->nested.need_vmcs12_sync) {
5261 if (vmx->nested.hv_evmcs)
5262 copy_enlightened_to_vmcs12(vmx);
5263 else if (enable_shadow_vmcs)
5264 copy_shadow_to_vmcs12(vmx);
5265 }
5266
5267 if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12)))
5268 return -EFAULT;
5269
5270 if (nested_cpu_has_shadow_vmcs(vmcs12) &&
5271 vmcs12->vmcs_link_pointer != -1ull) {
5272 if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE,
5273 get_shadow_vmcs12(vcpu), sizeof(*vmcs12)))
5274 return -EFAULT;
5275 }
5276
5277out:
5278 return kvm_state.size;
5279}
5280
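/*
 * Illustration, not part of this patch: the blob produced by
 * vmx_get_nested_state() above is the fixed struct kvm_nested_state header
 * (assuming the KVM UAPI layout of header fields followed by a data[] byte
 * array), followed by a vmcs12 image at data[0] and, when a shadow VMCS is
 * in use, a second image at data[VMCS12_SIZE]. A minimal sketch for locating
 * the two images in an already-filled buffer; the helper name, "buf" and
 * "buf_size" are hypothetical, while struct vmcs12 and VMCS12_SIZE are the
 * definitions added by this patch in vmcs12.h.
 */
static int locate_vmcs12_images(void *buf, u32 buf_size,
				struct vmcs12 **vmcs12,
				struct vmcs12 **shadow_vmcs12)
{
	struct kvm_nested_state *state = buf;

	*vmcs12 = NULL;
	*shadow_vmcs12 = NULL;

	if (buf_size < sizeof(*state) || state->size > buf_size)
		return -EINVAL;

	/* size grows past the header only when a vmcs12 image was copied. */
	if (state->size >= sizeof(*state) + VMCS12_SIZE)
		*vmcs12 = (struct vmcs12 *)state->data;

	/* The shadow vmcs12, if present, sits at a fixed VMCS12_SIZE offset. */
	if (state->size >= sizeof(*state) + 2 * VMCS12_SIZE)
		*shadow_vmcs12 = (struct vmcs12 *)(state->data + VMCS12_SIZE);

	return 0;
}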
5281/*
5282 * Forcibly leave nested mode in order to be able to reset the VCPU later on.
5283 */
5284void vmx_leave_nested(struct kvm_vcpu *vcpu)
5285{
5286 if (is_guest_mode(vcpu)) {
5287 to_vmx(vcpu)->nested.nested_run_pending = 0;
5288 nested_vmx_vmexit(vcpu, -1, 0, 0);
5289 }
5290 free_nested(vcpu);
5291}
5292
5293static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
5294 struct kvm_nested_state __user *user_kvm_nested_state,
5295 struct kvm_nested_state *kvm_state)
5296{
5297 struct vcpu_vmx *vmx = to_vmx(vcpu);
5298 struct vmcs12 *vmcs12;
5299 u32 exit_qual;
5300 int ret;
5301
5302 if (kvm_state->format != 0)
5303 return -EINVAL;
5304
5305 if (kvm_state->flags & KVM_STATE_NESTED_EVMCS)
5306 nested_enable_evmcs(vcpu, NULL);
5307
5308 if (!nested_vmx_allowed(vcpu))
5309 return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL;
5310
5311 if (kvm_state->vmx.vmxon_pa == -1ull) {
5312 if (kvm_state->vmx.smm.flags)
5313 return -EINVAL;
5314
5315 if (kvm_state->vmx.vmcs_pa != -1ull)
5316 return -EINVAL;
5317
5318 vmx_leave_nested(vcpu);
5319 return 0;
5320 }
5321
5322 if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa))
5323 return -EINVAL;
5324
5325 if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
5326 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
5327 return -EINVAL;
5328
5329 if (kvm_state->vmx.smm.flags &
5330 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
5331 return -EINVAL;
5332
5333 /*
5334 * SMM temporarily disables VMX, so we cannot be in guest mode,
5335 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
5336 * must be zero.
5337 */
5338 if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags)
5339 return -EINVAL;
5340
5341 if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
5342 !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
5343 return -EINVAL;
5344
5345 vmx_leave_nested(vcpu);
5346 if (kvm_state->vmx.vmxon_pa == -1ull)
5347 return 0;
5348
5349 vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa;
5350 ret = enter_vmx_operation(vcpu);
5351 if (ret)
5352 return ret;
5353
5354 /* Empty 'VMXON' state is permitted */
5355 if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12))
5356 return 0;
5357
5358 if (kvm_state->vmx.vmcs_pa != -1ull) {
5359 if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa ||
5360 !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa))
5361 return -EINVAL;
5362
5363 set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa);
5364 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
5365 /*
5366 * Sync eVMCS upon entry as we may not have
5367 * HV_X64_MSR_VP_ASSIST_PAGE set up yet.
5368 */
5369 vmx->nested.need_vmcs12_sync = true;
5370 } else {
5371 return -EINVAL;
5372 }
5373
5374 if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
5375 vmx->nested.smm.vmxon = true;
5376 vmx->nested.vmxon = false;
5377
5378 if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
5379 vmx->nested.smm.guest_mode = true;
5380 }
5381
5382 vmcs12 = get_vmcs12(vcpu);
5383 if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12)))
5384 return -EFAULT;
5385
5386 if (vmcs12->hdr.revision_id != VMCS12_REVISION)
5387 return -EINVAL;
5388
5389 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
5390 return 0;
5391
5392 vmx->nested.nested_run_pending =
5393 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
5394
5395 if (nested_cpu_has_shadow_vmcs(vmcs12) &&
5396 vmcs12->vmcs_link_pointer != -1ull) {
5397 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
5398
5399 if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12))
5400 return -EINVAL;
5401
5402 if (copy_from_user(shadow_vmcs12,
5403 user_kvm_nested_state->data + VMCS12_SIZE,
5404 sizeof(*vmcs12)))
5405 return -EFAULT;
5406
5407 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
5408 !shadow_vmcs12->hdr.shadow_vmcs)
5409 return -EINVAL;
5410 }
5411
5412 if (nested_vmx_check_vmentry_prereqs(vcpu, vmcs12) ||
5413 nested_vmx_check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
5414 return -EINVAL;
5415
5416 vmx->nested.dirty_vmcs12 = true;
5417 ret = nested_vmx_enter_non_root_mode(vcpu, false);
5418 if (ret)
5419 return -EINVAL;
5420
5421 return 0;
5422}
5423
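/*
 * Illustration, not part of this patch: the flag validation performed by
 * vmx_set_nested_state() above reduces to a few invariants on the saved
 * state. A hypothetical stand-alone predicate expressing just those
 * invariants (the is_smm()-dependent check is omitted because it needs the
 * vCPU, not only the saved state):
 */
static bool nested_state_flags_valid(const struct kvm_nested_state *state)
{
	/* Only the two defined SMM flags may be set. */
	if (state->vmx.smm.flags &
	    ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
		return false;

	/* SMM guest mode and ordinary guest mode are mutually exclusive. */
	if ((state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    (state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return false;

	/* Guest mode inside SMM requires VMXON to have been active in SMM. */
	if ((state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    !(state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
		return false;

	return true;
}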
5424void nested_vmx_vcpu_setup(void)
5425{
5426 if (enable_shadow_vmcs) {
5427 /*
5428 * At vCPU creation, "VMWRITE to any supported field
5429 * in the VMCS" is supported, so use the more
5430 * permissive vmx_vmread_bitmap to specify both read
5431 * and write permissions for the shadow VMCS.
5432 */
5433 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
5434 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap));
5435 }
5436}
5437
5438/*
5439 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
5440 * returned for the various VMX controls MSRs when nested VMX is enabled.
5441 * The same values should also be used to verify that vmcs12 control fields are
5442 * valid during nested entry from L1 to L2.
5443 * Each of these control msrs has a low and high 32-bit half: A low bit is on
5444 * if the corresponding bit in the (32-bit) control field *must* be on, and a
5445 * bit in the high half is on if the corresponding bit in the control field
5446 * may be on. See also vmx_control_verify().
5447 */
5448void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
5449 bool apicv)
5450{
5451 /*
5452 * Note that as a general rule, the high half of the MSRs (bits in
5453 * the control fields which may be 1) should be initialized by the
5454 * intersection of the underlying hardware's MSR (i.e., features which
5455 * can be supported) and the list of features we want to expose -
5456 * because they are known to be properly supported in our code.
5457 * Also, usually, the low half of the MSRs (bits which must be 1) can
5458 * be set to 0, meaning that L1 may turn off any of these bits. The
5459 * reason is that if one of these bits is necessary, it will appear
5460	 * in vmcs01, and prepare_vmcs02, which bitwise-ORs the control
5461	 * fields of vmcs01 and vmcs12, will keep these bits set - and
5462 * nested_vmx_exit_reflected() will not pass related exits to L1.
5463 * These rules have exceptions below.
5464 */
5465
5466 /* pin-based controls */
5467 rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
5468 msrs->pinbased_ctls_low,
5469 msrs->pinbased_ctls_high);
5470 msrs->pinbased_ctls_low |=
5471 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
5472 msrs->pinbased_ctls_high &=
5473 PIN_BASED_EXT_INTR_MASK |
5474 PIN_BASED_NMI_EXITING |
5475 PIN_BASED_VIRTUAL_NMIS |
5476 (apicv ? PIN_BASED_POSTED_INTR : 0);
5477 msrs->pinbased_ctls_high |=
5478 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
5479 PIN_BASED_VMX_PREEMPTION_TIMER;
5480
5481 /* exit controls */
5482 rdmsr(MSR_IA32_VMX_EXIT_CTLS,
5483 msrs->exit_ctls_low,
5484 msrs->exit_ctls_high);
5485 msrs->exit_ctls_low =
5486 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
5487
5488 msrs->exit_ctls_high &=
5489#ifdef CONFIG_X86_64
5490 VM_EXIT_HOST_ADDR_SPACE_SIZE |
5491#endif
5492 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
5493 msrs->exit_ctls_high |=
5494 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
5495 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
5496 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
5497
5498 /* We support free control of debug control saving. */
5499 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
5500
5501 /* entry controls */
5502 rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
5503 msrs->entry_ctls_low,
5504 msrs->entry_ctls_high);
5505 msrs->entry_ctls_low =
5506 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
5507 msrs->entry_ctls_high &=
5508#ifdef CONFIG_X86_64
5509 VM_ENTRY_IA32E_MODE |
5510#endif
5511 VM_ENTRY_LOAD_IA32_PAT;
5512 msrs->entry_ctls_high |=
5513 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
5514
5515 /* We support free control of debug control loading. */
5516 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
5517
5518 /* cpu-based controls */
5519 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
5520 msrs->procbased_ctls_low,
5521 msrs->procbased_ctls_high);
5522 msrs->procbased_ctls_low =
5523 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
5524 msrs->procbased_ctls_high &=
5525 CPU_BASED_VIRTUAL_INTR_PENDING |
5526 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
5527 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
5528 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
5529 CPU_BASED_CR3_STORE_EXITING |
5530#ifdef CONFIG_X86_64
5531 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
5532#endif
5533 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
5534 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
5535 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
5536 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
5537 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
5538 /*
5539 * We can allow some features even when not supported by the
5540 * hardware. For example, L1 can specify an MSR bitmap - and we
5541 * can use it to avoid exits to L1 - even when L0 runs L2
5542 * without MSR bitmaps.
5543 */
5544 msrs->procbased_ctls_high |=
5545 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
5546 CPU_BASED_USE_MSR_BITMAPS;
5547
5548 /* We support free control of CR3 access interception. */
5549 msrs->procbased_ctls_low &=
5550 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
5551
5552 /*
5553 * secondary cpu-based controls. Do not include those that
5554 * depend on CPUID bits, they are added later by vmx_cpuid_update.
5555 */
5556 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
5557 msrs->secondary_ctls_low,
5558 msrs->secondary_ctls_high);
5559 msrs->secondary_ctls_low = 0;
5560 msrs->secondary_ctls_high &=
5561 SECONDARY_EXEC_DESC |
5562 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
5563 SECONDARY_EXEC_APIC_REGISTER_VIRT |
5564 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
5565 SECONDARY_EXEC_WBINVD_EXITING;
5566
5567 /*
5568 * We can emulate "VMCS shadowing," even if the hardware
5569 * doesn't support it.
5570 */
5571 msrs->secondary_ctls_high |=
5572 SECONDARY_EXEC_SHADOW_VMCS;
5573
5574 if (enable_ept) {
5575		/* nested EPT: emulate EPT for L1 as well */
5576 msrs->secondary_ctls_high |=
5577 SECONDARY_EXEC_ENABLE_EPT;
5578 msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
5579 VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
5580 if (cpu_has_vmx_ept_execute_only())
5581 msrs->ept_caps |=
5582 VMX_EPT_EXECUTE_ONLY_BIT;
5583 msrs->ept_caps &= ept_caps;
5584 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
5585 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
5586 VMX_EPT_1GB_PAGE_BIT;
5587 if (enable_ept_ad_bits) {
5588 msrs->secondary_ctls_high |=
5589 SECONDARY_EXEC_ENABLE_PML;
5590 msrs->ept_caps |= VMX_EPT_AD_BIT;
5591 }
5592 }
5593
5594 if (cpu_has_vmx_vmfunc()) {
5595 msrs->secondary_ctls_high |=
5596 SECONDARY_EXEC_ENABLE_VMFUNC;
5597 /*
5598 * Advertise EPTP switching unconditionally
5599 * since we emulate it
5600 */
5601 if (enable_ept)
5602 msrs->vmfunc_controls =
5603 VMX_VMFUNC_EPTP_SWITCHING;
5604 }
5605
5606 /*
5607 * Old versions of KVM use the single-context version without
5608 * checking for support, so declare that it is supported even
5609 * though it is treated as global context. The alternative is
5610 * not failing the single-context invvpid, and it is worse.
5611 */
5612 if (enable_vpid) {
5613 msrs->secondary_ctls_high |=
5614 SECONDARY_EXEC_ENABLE_VPID;
5615 msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
5616 VMX_VPID_EXTENT_SUPPORTED_MASK;
5617 }
5618
5619 if (enable_unrestricted_guest)
5620 msrs->secondary_ctls_high |=
5621 SECONDARY_EXEC_UNRESTRICTED_GUEST;
5622
5623 if (flexpriority_enabled)
5624 msrs->secondary_ctls_high |=
5625 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
5626
5627 /* miscellaneous data */
5628 rdmsr(MSR_IA32_VMX_MISC,
5629 msrs->misc_low,
5630 msrs->misc_high);
5631 msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
5632 msrs->misc_low |=
5633 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
5634 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
5635 VMX_MISC_ACTIVITY_HLT;
5636 msrs->misc_high = 0;
5637
5638 /*
5639 * This MSR reports some information about VMX support. We
5640 * should return information about the VMX we emulate for the
5641 * guest, and the VMCS structure we give it - not about the
5642 * VMX support of the underlying hardware.
5643 */
5644 msrs->basic =
5645 VMCS12_REVISION |
5646 VMX_BASIC_TRUE_CTLS |
5647 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
5648 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
5649
5650 if (cpu_has_vmx_basic_inout())
5651 msrs->basic |= VMX_BASIC_INOUT;
5652
5653 /*
5654 * These MSRs specify bits which the guest must keep fixed on
5655 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
5656 * We picked the standard core2 setting.
5657 */
5658#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
5659#define VMXON_CR4_ALWAYSON X86_CR4_VMXE
5660 msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
5661 msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
5662
5663 /* These MSRs specify bits which the guest must keep fixed off. */
5664 rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
5665 rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
5666
5667 /* highest index: VMX_PREEMPTION_TIMER_VALUE */
5668 msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
5669}
5670
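/*
 * Illustration, not part of this patch: the low/high halves populated by
 * nested_vmx_setup_ctls_msrs() above are the allowed-0 and allowed-1
 * settings of each control MSR. A 32-bit control value requested by L1 is
 * acceptable exactly when every bit set in the low half is set in the value
 * and every bit clear in the high half is clear in the value. A hypothetical
 * checker with the same semantics as vmx_control_verify(), which the comment
 * at the top of the function refers to:
 */
static inline bool nested_ctl_allowed(u32 control, u32 low, u32 high)
{
	return ((control & high) | low) == control;
}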
5671void nested_vmx_hardware_unsetup(void)
5672{
5673 int i;
5674
5675 if (enable_shadow_vmcs) {
5676 for (i = 0; i < VMX_BITMAP_NR; i++)
5677 free_page((unsigned long)vmx_bitmap[i]);
5678 }
5679}
5680
5681__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
5682{
5683 int i;
5684
5685 if (!cpu_has_vmx_shadow_vmcs())
5686 enable_shadow_vmcs = 0;
5687 if (enable_shadow_vmcs) {
5688 for (i = 0; i < VMX_BITMAP_NR; i++) {
5689 vmx_bitmap[i] = (unsigned long *)
5690 __get_free_page(GFP_KERNEL);
5691 if (!vmx_bitmap[i]) {
5692 nested_vmx_hardware_unsetup();
5693 return -ENOMEM;
5694 }
5695 }
5696
5697 init_vmcs_shadow_fields();
5698 }
5699
5700	exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear;
5701	exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch;
5702	exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld;
5703	exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst;
5704	exit_handlers[EXIT_REASON_VMREAD] = handle_vmread;
5705	exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume;
5706	exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite;
5707	exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff;
5708	exit_handlers[EXIT_REASON_VMON] = handle_vmon;
5709	exit_handlers[EXIT_REASON_INVEPT] = handle_invept;
5710	exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid;
5711	exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc;
5712
5713 kvm_x86_ops->check_nested_events = vmx_check_nested_events;
5714 kvm_x86_ops->get_nested_state = vmx_get_nested_state;
5715 kvm_x86_ops->set_nested_state = vmx_set_nested_state;
5716	kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages;
5717 kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs;
5718 kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version;
5719
5720 return 0;
5721}
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
new file mode 100644
index 000000000000..e847ff1019a2
--- /dev/null
+++ b/arch/x86/kvm/vmx/nested.h
@@ -0,0 +1,282 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef __KVM_X86_VMX_NESTED_H
3#define __KVM_X86_VMX_NESTED_H
4
5#include "kvm_cache_regs.h"
6#include "vmcs12.h"
7#include "vmx.h"
8
9void vmx_leave_nested(struct kvm_vcpu *vcpu);
10void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
11 bool apicv);
12void nested_vmx_hardware_unsetup(void);
13__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *));
14void nested_vmx_vcpu_setup(void);
15void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu);
16int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry);
17bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason);
18void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
19 u32 exit_intr_info, unsigned long exit_qualification);
20void nested_sync_from_vmcs12(struct kvm_vcpu *vcpu);
21int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
22int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata);
23int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
24 u32 vmx_instruction_info, bool wr, gva_t *ret);
25
26static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
27{
28 return to_vmx(vcpu)->nested.cached_vmcs12;
29}
30
31static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu)
32{
33 return to_vmx(vcpu)->nested.cached_shadow_vmcs12;
34}
35
36static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu)
37{
38 struct vcpu_vmx *vmx = to_vmx(vcpu);
39
40 /*
41 * In case we do two consecutive get/set_nested_state()s while L2 was
42	 * running, hv_evmcs may end up not being mapped (we map it from
43 * nested_vmx_run()/vmx_vcpu_run()). Check is_guest_mode() as we always
44 * have vmcs12 if it is true.
45 */
46 return is_guest_mode(vcpu) || vmx->nested.current_vmptr != -1ull ||
47 vmx->nested.hv_evmcs;
48}
49
50static inline unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
51{
52 /* return the page table to be shadowed - in our case, EPT12 */
53 return get_vmcs12(vcpu)->ept_pointer;
54}
55
56static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
57{
58 return nested_ept_get_cr3(vcpu) & VMX_EPTP_AD_ENABLE_BIT;
59}
60
61/*
62 * Reflect a VM Exit into L1.
63 */
64static inline int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu,
65 u32 exit_reason)
66{
67 u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
68
69 /*
70 * At this point, the exit interruption info in exit_intr_info
71 * is only valid for EXCEPTION_NMI exits. For EXTERNAL_INTERRUPT
72 * we need to query the in-kernel LAPIC.
73 */
74 WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT);
75 if ((exit_intr_info &
76 (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
77 (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
78 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
79
80 vmcs12->vm_exit_intr_error_code =
81 vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
82 }
83
84 nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info,
85 vmcs_readl(EXIT_QUALIFICATION));
86 return 1;
87}
88
89/*
90 * Return the cr0 value that a nested guest would read. This is a combination
91 * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
92 * its hypervisor (cr0_read_shadow).
93 */
94static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
95{
96 return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
97 (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
98}
99static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
100{
101 return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
102 (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
103}
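/*
 * Example: if L1 sets cr0_guest_host_mask = X86_CR0_TS (i.e. L1 owns TS)
 * and cr0_read_shadow has TS set while the real guest_cr0 has TS clear,
 * nested_read_cr0() reports TS as set: bits owned by L1 come from the read
 * shadow, all other bits come from guest_cr0.
 */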
104
105static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
106{
107 return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low);
108}
109
110/*
111 * Do the virtual VMX capability MSRs specify that L1 can use VMWRITE
112 * to modify any valid field of the VMCS, or are the VM-exit
113 * information fields read-only?
114 */
115static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu)
116{
117 return to_vmx(vcpu)->nested.msrs.misc_low &
118 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
119}
120
121static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu)
122{
123 return to_vmx(vcpu)->nested.msrs.misc_low & VMX_MISC_ZERO_LEN_INS;
124}
125
126static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu)
127{
128 return to_vmx(vcpu)->nested.msrs.procbased_ctls_high &
129 CPU_BASED_MONITOR_TRAP_FLAG;
130}
131
132static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu)
133{
134 return to_vmx(vcpu)->nested.msrs.secondary_ctls_high &
135 SECONDARY_EXEC_SHADOW_VMCS;
136}
137
138static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
139{
140 return vmcs12->cpu_based_vm_exec_control & bit;
141}
142
143static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
144{
145 return (vmcs12->cpu_based_vm_exec_control &
146 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
147 (vmcs12->secondary_vm_exec_control & bit);
148}
149
150static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
151{
152 return vmcs12->pin_based_vm_exec_control &
153 PIN_BASED_VMX_PREEMPTION_TIMER;
154}
155
156static inline bool nested_cpu_has_nmi_exiting(struct vmcs12 *vmcs12)
157{
158 return vmcs12->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING;
159}
160
161static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
162{
163 return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
164}
165
166static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
167{
168 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
169}
170
171static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
172{
173 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
174}
175
176static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12)
177{
178 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML);
179}
180
181static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
182{
183 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
184}
185
186static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12)
187{
188 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID);
189}
190
191static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
192{
193 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
194}
195
196static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
197{
198 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
199}
200
201static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
202{
203 return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
204}
205
206static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
207{
208 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
209}
210
211static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
212{
213 return nested_cpu_has_vmfunc(vmcs12) &&
214 (vmcs12->vm_function_control &
215 VMX_VMFUNC_EPTP_SWITCHING);
216}
217
218static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12)
219{
220 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS);
221}
222
223static inline bool nested_cpu_has_save_preemption_timer(struct vmcs12 *vmcs12)
224{
225 return vmcs12->vm_exit_controls &
226 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
227}
228
229/*
230 * In nested virtualization, check if L1 asked to exit on external interrupts.
231 * For most existing hypervisors, this will always return true.
232 */
233static inline bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
234{
235 return get_vmcs12(vcpu)->pin_based_vm_exec_control &
236 PIN_BASED_EXT_INTR_MASK;
237}
238
239/*
240 * if fixed0[i] == 1: val[i] must be 1
241 * if fixed1[i] == 0: val[i] must be 0
242 */
243static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1)
244{
245 return ((val & fixed1) | fixed0) == val;
246}
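/*
 * Example: with fixed0 = 0x1 (bit 0 must be 1) and fixed1 = ~0x2ULL (bit 1
 * must be 0), val = 0x5 passes since ((0x5 & ~0x2) | 0x1) == 0x5, while
 * val = 0x4 violates the fixed0 rule (bit 0 clear) and val = 0x7 violates
 * the fixed1 rule (bit 1 set).
 */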
247
248static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
249{
250 u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
251 u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
252 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
253
254 if (to_vmx(vcpu)->nested.msrs.secondary_ctls_high &
255 SECONDARY_EXEC_UNRESTRICTED_GUEST &&
256 nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
257 fixed0 &= ~(X86_CR0_PE | X86_CR0_PG);
258
259 return fixed_bits_valid(val, fixed0, fixed1);
260}
261
262static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
263{
264 u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
265 u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
266
267 return fixed_bits_valid(val, fixed0, fixed1);
268}
269
270static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
271{
272 u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0;
273 u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1;
274
275 return fixed_bits_valid(val, fixed0, fixed1);
276}
277
278/* No difference in the restrictions on guest and host CR4 in VMX operation. */
279#define nested_guest_cr4_valid nested_cr4_valid
280#define nested_host_cr4_valid nested_cr4_valid
281
282#endif /* __KVM_X86_VMX_NESTED_H */
diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h
new file mode 100644
index 000000000000..b8e50f76fefc
--- /dev/null
+++ b/arch/x86/kvm/vmx/ops.h
@@ -0,0 +1,285 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef __KVM_X86_VMX_INSN_H
3#define __KVM_X86_VMX_INSN_H
4
5#include <linux/nospec.h>
6
7#include <asm/kvm_host.h>
8#include <asm/vmx.h>
9
10#include "evmcs.h"
11#include "vmcs.h"
12
13#define __ex(x) __kvm_handle_fault_on_reboot(x)
14#define __ex_clear(x, reg) \
15 ____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg)
16
17static __always_inline void vmcs_check16(unsigned long field)
18{
19 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
20 "16-bit accessor invalid for 64-bit field");
21 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
22 "16-bit accessor invalid for 64-bit high field");
23 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
24 "16-bit accessor invalid for 32-bit high field");
25 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
26 "16-bit accessor invalid for natural width field");
27}
28
29static __always_inline void vmcs_check32(unsigned long field)
30{
31 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
32 "32-bit accessor invalid for 16-bit field");
33 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
34 "32-bit accessor invalid for natural width field");
35}
36
37static __always_inline void vmcs_check64(unsigned long field)
38{
39 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
40 "64-bit accessor invalid for 16-bit field");
41 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
42 "64-bit accessor invalid for 64-bit high field");
43 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
44 "64-bit accessor invalid for 32-bit field");
45 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
46 "64-bit accessor invalid for natural width field");
47}
48
49static __always_inline void vmcs_checkl(unsigned long field)
50{
51 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
52 "Natural width accessor invalid for 16-bit field");
53 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
54 "Natural width accessor invalid for 64-bit field");
55 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
56 "Natural width accessor invalid for 64-bit high field");
57 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
58 "Natural width accessor invalid for 32-bit field");
59}
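/*
 * The checks above rely on the VMCS field encoding: bit 0 selects the high
 * 32 bits of a 64-bit field, and bits 14:13 give the field width (0 = 16-bit,
 * 1 = 64-bit, 2 = 32-bit, 3 = natural width). Hence (field & 0x6001) == 0x2000
 * matches the full form of a 64-bit field and (field & 0x6000) == 0x4000
 * matches a 32-bit field.
 */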
60
61static __always_inline unsigned long __vmcs_readl(unsigned long field)
62{
63 unsigned long value;
64
65 asm volatile (__ex_clear("vmread %1, %0", "%k0")
66 : "=r"(value) : "r"(field));
67 return value;
68}
69
70static __always_inline u16 vmcs_read16(unsigned long field)
71{
72 vmcs_check16(field);
73 if (static_branch_unlikely(&enable_evmcs))
74 return evmcs_read16(field);
75 return __vmcs_readl(field);
76}
77
78static __always_inline u32 vmcs_read32(unsigned long field)
79{
80 vmcs_check32(field);
81 if (static_branch_unlikely(&enable_evmcs))
82 return evmcs_read32(field);
83 return __vmcs_readl(field);
84}
85
86static __always_inline u64 vmcs_read64(unsigned long field)
87{
88 vmcs_check64(field);
89 if (static_branch_unlikely(&enable_evmcs))
90 return evmcs_read64(field);
91#ifdef CONFIG_X86_64
92 return __vmcs_readl(field);
93#else
94 return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32);
95#endif
96}
97
98static __always_inline unsigned long vmcs_readl(unsigned long field)
99{
100 vmcs_checkl(field);
101 if (static_branch_unlikely(&enable_evmcs))
102 return evmcs_read64(field);
103 return __vmcs_readl(field);
104}
105
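/*
 * Illustration, not part of this patch: with the wanted VMCS made current
 * via vmcs_load() below, exit information is read with the width-matched
 * accessors; VM_EXIT_REASON and VM_EXIT_INTR_INFO are 32-bit fields and
 * EXIT_QUALIFICATION is natural width. The helper name here is hypothetical.
 */
static inline void example_read_exit_info(u32 *reason, u32 *intr_info,
					  unsigned long *qual)
{
	*reason = vmcs_read32(VM_EXIT_REASON);
	*intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
	*qual = vmcs_readl(EXIT_QUALIFICATION);
}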
106static noinline void vmwrite_error(unsigned long field, unsigned long value)
107{
108 printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
109 field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
110 dump_stack();
111}
112
113static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
114{
115 bool error;
116
117 asm volatile (__ex("vmwrite %2, %1") CC_SET(na)
118 : CC_OUT(na) (error) : "r"(field), "rm"(value));
119 if (unlikely(error))
120 vmwrite_error(field, value);
121}
122
123static __always_inline void vmcs_write16(unsigned long field, u16 value)
124{
125 vmcs_check16(field);
126 if (static_branch_unlikely(&enable_evmcs))
127 return evmcs_write16(field, value);
128
129 __vmcs_writel(field, value);
130}
131
132static __always_inline void vmcs_write32(unsigned long field, u32 value)
133{
134 vmcs_check32(field);
135 if (static_branch_unlikely(&enable_evmcs))
136 return evmcs_write32(field, value);
137
138 __vmcs_writel(field, value);
139}
140
141static __always_inline void vmcs_write64(unsigned long field, u64 value)
142{
143 vmcs_check64(field);
144 if (static_branch_unlikely(&enable_evmcs))
145 return evmcs_write64(field, value);
146
147 __vmcs_writel(field, value);
148#ifndef CONFIG_X86_64
149 asm volatile ("");
150 __vmcs_writel(field+1, value >> 32);
151#endif
152}
153
154static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
155{
156 vmcs_checkl(field);
157 if (static_branch_unlikely(&enable_evmcs))
158 return evmcs_write64(field, value);
159
160 __vmcs_writel(field, value);
161}
162
163static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
164{
165 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
166 "vmcs_clear_bits does not support 64-bit fields");
167 if (static_branch_unlikely(&enable_evmcs))
168 return evmcs_write32(field, evmcs_read32(field) & ~mask);
169
170 __vmcs_writel(field, __vmcs_readl(field) & ~mask);
171}
172
173static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
174{
175 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
176 "vmcs_set_bits does not support 64-bit fields");
177 if (static_branch_unlikely(&enable_evmcs))
178 return evmcs_write32(field, evmcs_read32(field) | mask);
179
180 __vmcs_writel(field, __vmcs_readl(field) | mask);
181}
182
183static inline void vmcs_clear(struct vmcs *vmcs)
184{
185 u64 phys_addr = __pa(vmcs);
186 bool error;
187
188 asm volatile (__ex("vmclear %1") CC_SET(na)
189 : CC_OUT(na) (error) : "m"(phys_addr));
190 if (unlikely(error))
191 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
192 vmcs, phys_addr);
193}
194
195static inline void vmcs_load(struct vmcs *vmcs)
196{
197 u64 phys_addr = __pa(vmcs);
198 bool error;
199
200 if (static_branch_unlikely(&enable_evmcs))
201 return evmcs_load(phys_addr);
202
203 asm volatile (__ex("vmptrld %1") CC_SET(na)
204 : CC_OUT(na) (error) : "m"(phys_addr));
205 if (unlikely(error))
206 printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
207 vmcs, phys_addr);
208}
209
210static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
211{
212 struct {
213 u64 vpid : 16;
214 u64 rsvd : 48;
215 u64 gva;
216 } operand = { vpid, 0, gva };
217 bool error;
218
219 asm volatile (__ex("invvpid %2, %1") CC_SET(na)
220 : CC_OUT(na) (error) : "r"(ext), "m"(operand));
221 BUG_ON(error);
222}
223
224static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
225{
226 struct {
227 u64 eptp, gpa;
228 } operand = {eptp, gpa};
229 bool error;
230
231 asm volatile (__ex("invept %2, %1") CC_SET(na)
232 : CC_OUT(na) (error) : "r"(ext), "m"(operand));
233 BUG_ON(error);
234}
235
236static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr)
237{
238 if (vpid == 0)
239 return true;
240
241 if (cpu_has_vmx_invvpid_individual_addr()) {
242 __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr);
243 return true;
244 }
245
246 return false;
247}
248
249static inline void vpid_sync_vcpu_single(int vpid)
250{
251 if (vpid == 0)
252 return;
253
254 if (cpu_has_vmx_invvpid_single())
255 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
256}
257
258static inline void vpid_sync_vcpu_global(void)
259{
260 if (cpu_has_vmx_invvpid_global())
261 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
262}
263
264static inline void vpid_sync_context(int vpid)
265{
266 if (cpu_has_vmx_invvpid_single())
267 vpid_sync_vcpu_single(vpid);
268 else
269 vpid_sync_vcpu_global();
270}
271
272static inline void ept_sync_global(void)
273{
274 __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
275}
276
277static inline void ept_sync_context(u64 eptp)
278{
279 if (cpu_has_vmx_invept_context())
280 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
281 else
282 ept_sync_global();
283}
284
285#endif /* __KVM_X86_VMX_INSN_H */
diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 5ab4a364348e..5ab4a364348e 100644
--- a/arch/x86/kvm/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
new file mode 100644
index 000000000000..6def3ba88e3b
--- /dev/null
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -0,0 +1,136 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef __KVM_X86_VMX_VMCS_H
3#define __KVM_X86_VMX_VMCS_H
4
5#include <linux/ktime.h>
6#include <linux/list.h>
7#include <linux/nospec.h>
8
9#include <asm/kvm.h>
10#include <asm/vmx.h>
11
12#include "capabilities.h"
13
14struct vmcs_hdr {
15 u32 revision_id:31;
16 u32 shadow_vmcs:1;
17};
18
19struct vmcs {
20 struct vmcs_hdr hdr;
21 u32 abort;
22 char data[0];
23};
24
25DECLARE_PER_CPU(struct vmcs *, current_vmcs);
26
27/*
28 * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT
29 * and whose values change infrequently, but are not constant. I.e. this is
30 * used as a write-through cache of the corresponding VMCS fields.
31 */
32struct vmcs_host_state {
33 unsigned long cr3; /* May not match real cr3 */
34 unsigned long cr4; /* May not match real cr4 */
35 unsigned long gs_base;
36 unsigned long fs_base;
37
38 u16 fs_sel, gs_sel, ldt_sel;
39#ifdef CONFIG_X86_64
40 u16 ds_sel, es_sel;
41#endif
42};
43
44/*
45 * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
46 * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
47 * loaded on this CPU (so we can clear them if the CPU goes down).
48 */
49struct loaded_vmcs {
50 struct vmcs *vmcs;
51 struct vmcs *shadow_vmcs;
52 int cpu;
53 bool launched;
54 bool nmi_known_unmasked;
55 bool hv_timer_armed;
56 /* Support for vnmi-less CPUs */
57 int soft_vnmi_blocked;
58 ktime_t entry_time;
59 s64 vnmi_blocked_time;
60 unsigned long *msr_bitmap;
61 struct list_head loaded_vmcss_on_cpu_link;
62 struct vmcs_host_state host_state;
63};
64
65static inline bool is_exception_n(u32 intr_info, u8 vector)
66{
67 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
68 INTR_INFO_VALID_MASK)) ==
69 (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
70}
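/*
 * Example: a page fault that delivers an error code is reported with
 * intr_info = 0x80000b0e (valid = bit 31, deliver-error-code = bit 11,
 * type = 3 "hardware exception" in bits 10:8, vector = 0x0e). The
 * error-code bit is not part of the mask above, so
 * is_exception_n(0x80000b0e, PF_VECTOR) still matches.
 */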
71
72static inline bool is_debug(u32 intr_info)
73{
74 return is_exception_n(intr_info, DB_VECTOR);
75}
76
77static inline bool is_breakpoint(u32 intr_info)
78{
79 return is_exception_n(intr_info, BP_VECTOR);
80}
81
82static inline bool is_page_fault(u32 intr_info)
83{
84 return is_exception_n(intr_info, PF_VECTOR);
85}
86
87static inline bool is_invalid_opcode(u32 intr_info)
88{
89 return is_exception_n(intr_info, UD_VECTOR);
90}
91
92static inline bool is_gp_fault(u32 intr_info)
93{
94 return is_exception_n(intr_info, GP_VECTOR);
95}
96
97static inline bool is_machine_check(u32 intr_info)
98{
99 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
100 INTR_INFO_VALID_MASK)) ==
101 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
102}
103
104/* Undocumented: icebp/int1 */
105static inline bool is_icebp(u32 intr_info)
106{
107 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
108 == (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK);
109}
110
111static inline bool is_nmi(u32 intr_info)
112{
113 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
114 == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
115}
116
117enum vmcs_field_width {
118 VMCS_FIELD_WIDTH_U16 = 0,
119 VMCS_FIELD_WIDTH_U64 = 1,
120 VMCS_FIELD_WIDTH_U32 = 2,
121 VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3
122};
123
124static inline int vmcs_field_width(unsigned long field)
125{
126 if (0x1 & field) /* the *_HIGH fields are all 32 bit */
127 return VMCS_FIELD_WIDTH_U32;
128 return (field >> 13) & 0x3;
129}
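/*
 * Example: GUEST_RIP (0x681e) has bits 14:13 == 3 and is therefore natural
 * width, while TSC_OFFSET_HIGH (0x2011) has bit 0 set and is treated as a
 * 32-bit access to the upper half of the 64-bit TSC_OFFSET field.
 */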
130
131static inline int vmcs_field_readonly(unsigned long field)
132{
133 return (((field >> 10) & 0x3) == 1);
134}
135
136#endif /* __KVM_X86_VMX_VMCS_H */
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
new file mode 100644
index 000000000000..53dfb401316d
--- /dev/null
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -0,0 +1,157 @@
1// SPDX-License-Identifier: GPL-2.0
2
3#include "vmcs12.h"
4
5#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
6#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
7#define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name)
8#define FIELD64(number, name) \
9 FIELD(number, name), \
10 [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32)
11
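/*
 * The table below is indexed by ROL16(encoding, 6): rotating the 16-bit
 * field encoding left by 6 moves the high encoding bits (field type and
 * width) into the low bits of the index, so the array stays far smaller
 * than it would be if indexed by the raw encoding.
 */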
12const unsigned short vmcs_field_to_offset_table[] = {
13 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
14 FIELD(POSTED_INTR_NV, posted_intr_nv),
15 FIELD(GUEST_ES_SELECTOR, guest_es_selector),
16 FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
17 FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
18 FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
19 FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
20 FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
21 FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
22 FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
23 FIELD(GUEST_INTR_STATUS, guest_intr_status),
24 FIELD(GUEST_PML_INDEX, guest_pml_index),
25 FIELD(HOST_ES_SELECTOR, host_es_selector),
26 FIELD(HOST_CS_SELECTOR, host_cs_selector),
27 FIELD(HOST_SS_SELECTOR, host_ss_selector),
28 FIELD(HOST_DS_SELECTOR, host_ds_selector),
29 FIELD(HOST_FS_SELECTOR, host_fs_selector),
30 FIELD(HOST_GS_SELECTOR, host_gs_selector),
31 FIELD(HOST_TR_SELECTOR, host_tr_selector),
32 FIELD64(IO_BITMAP_A, io_bitmap_a),
33 FIELD64(IO_BITMAP_B, io_bitmap_b),
34 FIELD64(MSR_BITMAP, msr_bitmap),
35 FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
36 FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
37 FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
38 FIELD64(PML_ADDRESS, pml_address),
39 FIELD64(TSC_OFFSET, tsc_offset),
40 FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
41 FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
42 FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
43 FIELD64(VM_FUNCTION_CONTROL, vm_function_control),
44 FIELD64(EPT_POINTER, ept_pointer),
45 FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
46 FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
47 FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
48 FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
49 FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
50 FIELD64(VMREAD_BITMAP, vmread_bitmap),
51 FIELD64(VMWRITE_BITMAP, vmwrite_bitmap),
52 FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
53 FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
54 FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
55 FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
56 FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
57 FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
58 FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
59 FIELD64(GUEST_PDPTR0, guest_pdptr0),
60 FIELD64(GUEST_PDPTR1, guest_pdptr1),
61 FIELD64(GUEST_PDPTR2, guest_pdptr2),
62 FIELD64(GUEST_PDPTR3, guest_pdptr3),
63 FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
64 FIELD64(HOST_IA32_PAT, host_ia32_pat),
65 FIELD64(HOST_IA32_EFER, host_ia32_efer),
66 FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
67 FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
68 FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
69 FIELD(EXCEPTION_BITMAP, exception_bitmap),
70 FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
71 FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
72 FIELD(CR3_TARGET_COUNT, cr3_target_count),
73 FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
74 FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
75 FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
76 FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
77 FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
78 FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
79 FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
80 FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
81 FIELD(TPR_THRESHOLD, tpr_threshold),
82 FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
83 FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
84 FIELD(VM_EXIT_REASON, vm_exit_reason),
85 FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
86 FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
87 FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
88 FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
89 FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
90 FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
91 FIELD(GUEST_ES_LIMIT, guest_es_limit),
92 FIELD(GUEST_CS_LIMIT, guest_cs_limit),
93 FIELD(GUEST_SS_LIMIT, guest_ss_limit),
94 FIELD(GUEST_DS_LIMIT, guest_ds_limit),
95 FIELD(GUEST_FS_LIMIT, guest_fs_limit),
96 FIELD(GUEST_GS_LIMIT, guest_gs_limit),
97 FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
98 FIELD(GUEST_TR_LIMIT, guest_tr_limit),
99 FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
100 FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
101 FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
102 FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
103 FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
104 FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
105 FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
106 FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
107 FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
108 FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
109 FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
110 FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
111 FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
112 FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
113 FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
114 FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
115 FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
116 FIELD(CR0_READ_SHADOW, cr0_read_shadow),
117 FIELD(CR4_READ_SHADOW, cr4_read_shadow),
118 FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
119 FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
120 FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
121 FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
122 FIELD(EXIT_QUALIFICATION, exit_qualification),
123 FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
124 FIELD(GUEST_CR0, guest_cr0),
125 FIELD(GUEST_CR3, guest_cr3),
126 FIELD(GUEST_CR4, guest_cr4),
127 FIELD(GUEST_ES_BASE, guest_es_base),
128 FIELD(GUEST_CS_BASE, guest_cs_base),
129 FIELD(GUEST_SS_BASE, guest_ss_base),
130 FIELD(GUEST_DS_BASE, guest_ds_base),
131 FIELD(GUEST_FS_BASE, guest_fs_base),
132 FIELD(GUEST_GS_BASE, guest_gs_base),
133 FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
134 FIELD(GUEST_TR_BASE, guest_tr_base),
135 FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
136 FIELD(GUEST_IDTR_BASE, guest_idtr_base),
137 FIELD(GUEST_DR7, guest_dr7),
138 FIELD(GUEST_RSP, guest_rsp),
139 FIELD(GUEST_RIP, guest_rip),
140 FIELD(GUEST_RFLAGS, guest_rflags),
141 FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
142 FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
143 FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
144 FIELD(HOST_CR0, host_cr0),
145 FIELD(HOST_CR3, host_cr3),
146 FIELD(HOST_CR4, host_cr4),
147 FIELD(HOST_FS_BASE, host_fs_base),
148 FIELD(HOST_GS_BASE, host_gs_base),
149 FIELD(HOST_TR_BASE, host_tr_base),
150 FIELD(HOST_GDTR_BASE, host_gdtr_base),
151 FIELD(HOST_IDTR_BASE, host_idtr_base),
152 FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
153 FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
154 FIELD(HOST_RSP, host_rsp),
155 FIELD(HOST_RIP, host_rip),
156};
157const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs_field_to_offset_table);
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
new file mode 100644
index 000000000000..3a742428ad17
--- /dev/null
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -0,0 +1,462 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef __KVM_X86_VMX_VMCS12_H
3#define __KVM_X86_VMX_VMCS12_H
4
5#include <linux/build_bug.h>
6
7#include "vmcs.h"
8
9/*
10 * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
11 * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
12 * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
13 * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
14 * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
15 * More than one of these structures may exist, if L1 runs multiple L2 guests.
16 * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
17 * underlying hardware which will be used to run L2.
18 * This structure is packed to ensure that its layout is identical across
19 * machines (necessary for live migration).
20 *
21 * IMPORTANT: Changing the layout of existing fields in this structure
22 * will break save/restore compatibility with older kvm releases. When
23 * adding new fields, either use space in the reserved padding* arrays
24 * or add the new fields to the end of the structure.
25 */
26typedef u64 natural_width;
27struct __packed vmcs12 {
28 /* According to the Intel spec, a VMCS region must start with the
29 * following two fields. Then follow implementation-specific data.
30 */
31 struct vmcs_hdr hdr;
32 u32 abort;
33
34 u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
35 u32 padding[7]; /* room for future expansion */
36
37 u64 io_bitmap_a;
38 u64 io_bitmap_b;
39 u64 msr_bitmap;
40 u64 vm_exit_msr_store_addr;
41 u64 vm_exit_msr_load_addr;
42 u64 vm_entry_msr_load_addr;
43 u64 tsc_offset;
44 u64 virtual_apic_page_addr;
45 u64 apic_access_addr;
46 u64 posted_intr_desc_addr;
47 u64 ept_pointer;
48 u64 eoi_exit_bitmap0;
49 u64 eoi_exit_bitmap1;
50 u64 eoi_exit_bitmap2;
51 u64 eoi_exit_bitmap3;
52 u64 xss_exit_bitmap;
53 u64 guest_physical_address;
54 u64 vmcs_link_pointer;
55 u64 guest_ia32_debugctl;
56 u64 guest_ia32_pat;
57 u64 guest_ia32_efer;
58 u64 guest_ia32_perf_global_ctrl;
59 u64 guest_pdptr0;
60 u64 guest_pdptr1;
61 u64 guest_pdptr2;
62 u64 guest_pdptr3;
63 u64 guest_bndcfgs;
64 u64 host_ia32_pat;
65 u64 host_ia32_efer;
66 u64 host_ia32_perf_global_ctrl;
67 u64 vmread_bitmap;
68 u64 vmwrite_bitmap;
69 u64 vm_function_control;
70 u64 eptp_list_address;
71 u64 pml_address;
72 u64 padding64[3]; /* room for future expansion */
73 /*
74 * To allow migration of L1 (complete with its L2 guests) between
75 * machines of different natural widths (32 or 64 bit), we cannot have
76 * unsigned long fields with no explicit size. We use u64 (aliased
77 * natural_width) instead. Luckily, x86 is little-endian.
78 */
79 natural_width cr0_guest_host_mask;
80 natural_width cr4_guest_host_mask;
81 natural_width cr0_read_shadow;
82 natural_width cr4_read_shadow;
83 natural_width cr3_target_value0;
84 natural_width cr3_target_value1;
85 natural_width cr3_target_value2;
86 natural_width cr3_target_value3;
87 natural_width exit_qualification;
88 natural_width guest_linear_address;
89 natural_width guest_cr0;
90 natural_width guest_cr3;
91 natural_width guest_cr4;
92 natural_width guest_es_base;
93 natural_width guest_cs_base;
94 natural_width guest_ss_base;
95 natural_width guest_ds_base;
96 natural_width guest_fs_base;
97 natural_width guest_gs_base;
98 natural_width guest_ldtr_base;
99 natural_width guest_tr_base;
100 natural_width guest_gdtr_base;
101 natural_width guest_idtr_base;
102 natural_width guest_dr7;
103 natural_width guest_rsp;
104 natural_width guest_rip;
105 natural_width guest_rflags;
106 natural_width guest_pending_dbg_exceptions;
107 natural_width guest_sysenter_esp;
108 natural_width guest_sysenter_eip;
109 natural_width host_cr0;
110 natural_width host_cr3;
111 natural_width host_cr4;
112 natural_width host_fs_base;
113 natural_width host_gs_base;
114 natural_width host_tr_base;
115 natural_width host_gdtr_base;
116 natural_width host_idtr_base;
117 natural_width host_ia32_sysenter_esp;
118 natural_width host_ia32_sysenter_eip;
119 natural_width host_rsp;
120 natural_width host_rip;
121 natural_width paddingl[8]; /* room for future expansion */
122 u32 pin_based_vm_exec_control;
123 u32 cpu_based_vm_exec_control;
124 u32 exception_bitmap;
125 u32 page_fault_error_code_mask;
126 u32 page_fault_error_code_match;
127 u32 cr3_target_count;
128 u32 vm_exit_controls;
129 u32 vm_exit_msr_store_count;
130 u32 vm_exit_msr_load_count;
131 u32 vm_entry_controls;
132 u32 vm_entry_msr_load_count;
133 u32 vm_entry_intr_info_field;
134 u32 vm_entry_exception_error_code;
135 u32 vm_entry_instruction_len;
136 u32 tpr_threshold;
137 u32 secondary_vm_exec_control;
138 u32 vm_instruction_error;
139 u32 vm_exit_reason;
140 u32 vm_exit_intr_info;
141 u32 vm_exit_intr_error_code;
142 u32 idt_vectoring_info_field;
143 u32 idt_vectoring_error_code;
144 u32 vm_exit_instruction_len;
145 u32 vmx_instruction_info;
146 u32 guest_es_limit;
147 u32 guest_cs_limit;
148 u32 guest_ss_limit;
149 u32 guest_ds_limit;
150 u32 guest_fs_limit;
151 u32 guest_gs_limit;
152 u32 guest_ldtr_limit;
153 u32 guest_tr_limit;
154 u32 guest_gdtr_limit;
155 u32 guest_idtr_limit;
156 u32 guest_es_ar_bytes;
157 u32 guest_cs_ar_bytes;
158 u32 guest_ss_ar_bytes;
159 u32 guest_ds_ar_bytes;
160 u32 guest_fs_ar_bytes;
161 u32 guest_gs_ar_bytes;
162 u32 guest_ldtr_ar_bytes;
163 u32 guest_tr_ar_bytes;
164 u32 guest_interruptibility_info;
165 u32 guest_activity_state;
166 u32 guest_sysenter_cs;
167 u32 host_ia32_sysenter_cs;
168 u32 vmx_preemption_timer_value;
169 u32 padding32[7]; /* room for future expansion */
170 u16 virtual_processor_id;
171 u16 posted_intr_nv;
172 u16 guest_es_selector;
173 u16 guest_cs_selector;
174 u16 guest_ss_selector;
175 u16 guest_ds_selector;
176 u16 guest_fs_selector;
177 u16 guest_gs_selector;
178 u16 guest_ldtr_selector;
179 u16 guest_tr_selector;
180 u16 guest_intr_status;
181 u16 host_es_selector;
182 u16 host_cs_selector;
183 u16 host_ss_selector;
184 u16 host_ds_selector;
185 u16 host_fs_selector;
186 u16 host_gs_selector;
187 u16 host_tr_selector;
188 u16 guest_pml_index;
189};
190
191/*
192 * VMCS12_REVISION is an arbitrary id that should be changed if the content or
193 * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
194 * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
195 *
196 * IMPORTANT: Changing this value will break save/restore compatibility with
197 * older kvm releases.
198 */
199#define VMCS12_REVISION 0x11e57ed0
200
201/*
202 * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
203 * and any VMCS region. Although only sizeof(struct vmcs12) bytes are used
204 * by the current implementation, 4K is reserved to avoid future complications.
205 */
206#define VMCS12_SIZE 0x1000
207
208/*
209 * VMCS12_MAX_FIELD_INDEX is the highest index value used in any
210 * supported VMCS12 field encoding.
211 */
212#define VMCS12_MAX_FIELD_INDEX 0x17
213
214/*
215 * For save/restore compatibility, the vmcs12 field offsets must not change.
216 */
217#define CHECK_OFFSET(field, loc) \
218 BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc), \
219 "Offset of " #field " in struct vmcs12 has changed.")
220
221static inline void vmx_check_vmcs12_offsets(void)
222{
223 CHECK_OFFSET(hdr, 0);
224 CHECK_OFFSET(abort, 4);
225 CHECK_OFFSET(launch_state, 8);
226 CHECK_OFFSET(io_bitmap_a, 40);
227 CHECK_OFFSET(io_bitmap_b, 48);
228 CHECK_OFFSET(msr_bitmap, 56);
229 CHECK_OFFSET(vm_exit_msr_store_addr, 64);
230 CHECK_OFFSET(vm_exit_msr_load_addr, 72);
231 CHECK_OFFSET(vm_entry_msr_load_addr, 80);
232 CHECK_OFFSET(tsc_offset, 88);
233 CHECK_OFFSET(virtual_apic_page_addr, 96);
234 CHECK_OFFSET(apic_access_addr, 104);
235 CHECK_OFFSET(posted_intr_desc_addr, 112);
236 CHECK_OFFSET(ept_pointer, 120);
237 CHECK_OFFSET(eoi_exit_bitmap0, 128);
238 CHECK_OFFSET(eoi_exit_bitmap1, 136);
239 CHECK_OFFSET(eoi_exit_bitmap2, 144);
240 CHECK_OFFSET(eoi_exit_bitmap3, 152);
241 CHECK_OFFSET(xss_exit_bitmap, 160);
242 CHECK_OFFSET(guest_physical_address, 168);
243 CHECK_OFFSET(vmcs_link_pointer, 176);
244 CHECK_OFFSET(guest_ia32_debugctl, 184);
245 CHECK_OFFSET(guest_ia32_pat, 192);
246 CHECK_OFFSET(guest_ia32_efer, 200);
247 CHECK_OFFSET(guest_ia32_perf_global_ctrl, 208);
248 CHECK_OFFSET(guest_pdptr0, 216);
249 CHECK_OFFSET(guest_pdptr1, 224);
250 CHECK_OFFSET(guest_pdptr2, 232);
251 CHECK_OFFSET(guest_pdptr3, 240);
252 CHECK_OFFSET(guest_bndcfgs, 248);
253 CHECK_OFFSET(host_ia32_pat, 256);
254 CHECK_OFFSET(host_ia32_efer, 264);
255 CHECK_OFFSET(host_ia32_perf_global_ctrl, 272);
256 CHECK_OFFSET(vmread_bitmap, 280);
257 CHECK_OFFSET(vmwrite_bitmap, 288);
258 CHECK_OFFSET(vm_function_control, 296);
259 CHECK_OFFSET(eptp_list_address, 304);
260 CHECK_OFFSET(pml_address, 312);
261 CHECK_OFFSET(cr0_guest_host_mask, 344);
262 CHECK_OFFSET(cr4_guest_host_mask, 352);
263 CHECK_OFFSET(cr0_read_shadow, 360);
264 CHECK_OFFSET(cr4_read_shadow, 368);
265 CHECK_OFFSET(cr3_target_value0, 376);
266 CHECK_OFFSET(cr3_target_value1, 384);
267 CHECK_OFFSET(cr3_target_value2, 392);
268 CHECK_OFFSET(cr3_target_value3, 400);
269 CHECK_OFFSET(exit_qualification, 408);
270 CHECK_OFFSET(guest_linear_address, 416);
271 CHECK_OFFSET(guest_cr0, 424);
272 CHECK_OFFSET(guest_cr3, 432);
273 CHECK_OFFSET(guest_cr4, 440);
274 CHECK_OFFSET(guest_es_base, 448);
275 CHECK_OFFSET(guest_cs_base, 456);
276 CHECK_OFFSET(guest_ss_base, 464);
277 CHECK_OFFSET(guest_ds_base, 472);
278 CHECK_OFFSET(guest_fs_base, 480);
279 CHECK_OFFSET(guest_gs_base, 488);
280 CHECK_OFFSET(guest_ldtr_base, 496);
281 CHECK_OFFSET(guest_tr_base, 504);
282 CHECK_OFFSET(guest_gdtr_base, 512);
283 CHECK_OFFSET(guest_idtr_base, 520);
284 CHECK_OFFSET(guest_dr7, 528);
285 CHECK_OFFSET(guest_rsp, 536);
286 CHECK_OFFSET(guest_rip, 544);
287 CHECK_OFFSET(guest_rflags, 552);
288 CHECK_OFFSET(guest_pending_dbg_exceptions, 560);
289 CHECK_OFFSET(guest_sysenter_esp, 568);
290 CHECK_OFFSET(guest_sysenter_eip, 576);
291 CHECK_OFFSET(host_cr0, 584);
292 CHECK_OFFSET(host_cr3, 592);
293 CHECK_OFFSET(host_cr4, 600);
294 CHECK_OFFSET(host_fs_base, 608);
295 CHECK_OFFSET(host_gs_base, 616);
296 CHECK_OFFSET(host_tr_base, 624);
297 CHECK_OFFSET(host_gdtr_base, 632);
298 CHECK_OFFSET(host_idtr_base, 640);
299 CHECK_OFFSET(host_ia32_sysenter_esp, 648);
300 CHECK_OFFSET(host_ia32_sysenter_eip, 656);
301 CHECK_OFFSET(host_rsp, 664);
302 CHECK_OFFSET(host_rip, 672);
303 CHECK_OFFSET(pin_based_vm_exec_control, 744);
304 CHECK_OFFSET(cpu_based_vm_exec_control, 748);
305 CHECK_OFFSET(exception_bitmap, 752);
306 CHECK_OFFSET(page_fault_error_code_mask, 756);
307 CHECK_OFFSET(page_fault_error_code_match, 760);
308 CHECK_OFFSET(cr3_target_count, 764);
309 CHECK_OFFSET(vm_exit_controls, 768);
310 CHECK_OFFSET(vm_exit_msr_store_count, 772);
311 CHECK_OFFSET(vm_exit_msr_load_count, 776);
312 CHECK_OFFSET(vm_entry_controls, 780);
313 CHECK_OFFSET(vm_entry_msr_load_count, 784);
314 CHECK_OFFSET(vm_entry_intr_info_field, 788);
315 CHECK_OFFSET(vm_entry_exception_error_code, 792);
316 CHECK_OFFSET(vm_entry_instruction_len, 796);
317 CHECK_OFFSET(tpr_threshold, 800);
318 CHECK_OFFSET(secondary_vm_exec_control, 804);
319 CHECK_OFFSET(vm_instruction_error, 808);
320 CHECK_OFFSET(vm_exit_reason, 812);
321 CHECK_OFFSET(vm_exit_intr_info, 816);
322 CHECK_OFFSET(vm_exit_intr_error_code, 820);
323 CHECK_OFFSET(idt_vectoring_info_field, 824);
324 CHECK_OFFSET(idt_vectoring_error_code, 828);
325 CHECK_OFFSET(vm_exit_instruction_len, 832);
326 CHECK_OFFSET(vmx_instruction_info, 836);
327 CHECK_OFFSET(guest_es_limit, 840);
328 CHECK_OFFSET(guest_cs_limit, 844);
329 CHECK_OFFSET(guest_ss_limit, 848);
330 CHECK_OFFSET(guest_ds_limit, 852);
331 CHECK_OFFSET(guest_fs_limit, 856);
332 CHECK_OFFSET(guest_gs_limit, 860);
333 CHECK_OFFSET(guest_ldtr_limit, 864);
334 CHECK_OFFSET(guest_tr_limit, 868);
335 CHECK_OFFSET(guest_gdtr_limit, 872);
336 CHECK_OFFSET(guest_idtr_limit, 876);
337 CHECK_OFFSET(guest_es_ar_bytes, 880);
338 CHECK_OFFSET(guest_cs_ar_bytes, 884);
339 CHECK_OFFSET(guest_ss_ar_bytes, 888);
340 CHECK_OFFSET(guest_ds_ar_bytes, 892);
341 CHECK_OFFSET(guest_fs_ar_bytes, 896);
342 CHECK_OFFSET(guest_gs_ar_bytes, 900);
343 CHECK_OFFSET(guest_ldtr_ar_bytes, 904);
344 CHECK_OFFSET(guest_tr_ar_bytes, 908);
345 CHECK_OFFSET(guest_interruptibility_info, 912);
346 CHECK_OFFSET(guest_activity_state, 916);
347 CHECK_OFFSET(guest_sysenter_cs, 920);
348 CHECK_OFFSET(host_ia32_sysenter_cs, 924);
349 CHECK_OFFSET(vmx_preemption_timer_value, 928);
350 CHECK_OFFSET(virtual_processor_id, 960);
351 CHECK_OFFSET(posted_intr_nv, 962);
352 CHECK_OFFSET(guest_es_selector, 964);
353 CHECK_OFFSET(guest_cs_selector, 966);
354 CHECK_OFFSET(guest_ss_selector, 968);
355 CHECK_OFFSET(guest_ds_selector, 970);
356 CHECK_OFFSET(guest_fs_selector, 972);
357 CHECK_OFFSET(guest_gs_selector, 974);
358 CHECK_OFFSET(guest_ldtr_selector, 976);
359 CHECK_OFFSET(guest_tr_selector, 978);
360 CHECK_OFFSET(guest_intr_status, 980);
361 CHECK_OFFSET(host_es_selector, 982);
362 CHECK_OFFSET(host_cs_selector, 984);
363 CHECK_OFFSET(host_ss_selector, 986);
364 CHECK_OFFSET(host_ds_selector, 988);
365 CHECK_OFFSET(host_fs_selector, 990);
366 CHECK_OFFSET(host_gs_selector, 992);
367 CHECK_OFFSET(host_tr_selector, 994);
368 CHECK_OFFSET(guest_pml_index, 996);
369}
370
371extern const unsigned short vmcs_field_to_offset_table[];
372extern const unsigned int nr_vmcs12_fields;
373
374#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
375
376static inline short vmcs_field_to_offset(unsigned long field)
377{
378 unsigned short offset;
379 unsigned int index;
380
381 if (field >> 15)
382 return -ENOENT;
383
384 index = ROL16(field, 6);
385 if (index >= nr_vmcs12_fields)
386 return -ENOENT;
387
388 index = array_index_nospec(index, nr_vmcs12_fields);
389 offset = vmcs_field_to_offset_table[index];
390 if (offset == 0)
391 return -ENOENT;
392 return offset;
393}
394
395#undef ROL16
396
397/*
398 * Read a vmcs12 field. Since these can have varying lengths and we return
399 * one type, we chose the biggest type (u64) and zero-extend the return value
400 * to that size. Note that the caller, handle_vmread, might need to use only
401 * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
402 * 64-bit fields are to be returned).
403 */
404static inline int vmcs12_read_any(struct vmcs12 *vmcs12,
405 unsigned long field, u64 *ret)
406{
407 short offset = vmcs_field_to_offset(field);
408 char *p;
409
410 if (offset < 0)
411 return offset;
412
413 p = (char *)vmcs12 + offset;
414
415 switch (vmcs_field_width(field)) {
416 case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
417 *ret = *((natural_width *)p);
418 return 0;
419 case VMCS_FIELD_WIDTH_U16:
420 *ret = *((u16 *)p);
421 return 0;
422 case VMCS_FIELD_WIDTH_U32:
423 *ret = *((u32 *)p);
424 return 0;
425 case VMCS_FIELD_WIDTH_U64:
426 *ret = *((u64 *)p);
427 return 0;
428 default:
429 WARN_ON(1);
430 return -ENOENT;
431 }
432}
433
434static inline int vmcs12_write_any(struct vmcs12 *vmcs12,
435 unsigned long field, u64 field_value){
436 short offset = vmcs_field_to_offset(field);
437 char *p = (char *)vmcs12 + offset;
438
439 if (offset < 0)
440 return offset;
441
442 switch (vmcs_field_width(field)) {
443 case VMCS_FIELD_WIDTH_U16:
444 *(u16 *)p = field_value;
445 return 0;
446 case VMCS_FIELD_WIDTH_U32:
447 *(u32 *)p = field_value;
448 return 0;
449 case VMCS_FIELD_WIDTH_U64:
450 *(u64 *)p = field_value;
451 return 0;
452 case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
453 *(natural_width *)p = field_value;
454 return 0;
455 default:
456 WARN_ON(1);
457 return -ENOENT;
458 }
459
460}
461
462#endif /* __KVM_X86_VMX_VMCS12_H */
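
A minimal usage sketch of the two accessors above (illustrative only, not part of this file): it assumes the vmcs12 definitions in this header and the GUEST_RIP field encoding from asm/vmx.h; the helper name is hypothetical.

	static inline int example_bump_vmcs12_rip(struct vmcs12 *vmcs12, u64 delta)
	{
		u64 rip;
		int err;

		/* Values are zero-extended to u64 regardless of the field width. */
		err = vmcs12_read_any(vmcs12, GUEST_RIP, &rip);
		if (err)
			return err;

		/* GUEST_RIP is a natural-width field; the store narrows as needed. */
		return vmcs12_write_any(vmcs12, GUEST_RIP, rip + delta);
	}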
diff --git a/arch/x86/kvm/vmx_shadow_fields.h b/arch/x86/kvm/vmx/vmcs_shadow_fields.h
index 132432f375c2..132432f375c2 100644
--- a/arch/x86/kvm/vmx_shadow_fields.h
+++ b/arch/x86/kvm/vmx/vmcs_shadow_fields.h
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
new file mode 100644
index 000000000000..bcef2c7e9bc4
--- /dev/null
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -0,0 +1,57 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#include <linux/linkage.h>
3#include <asm/asm.h>
4
5 .text
6
7/**
 8 * vmx_vmenter - VM-Enter the currently loaded VMCS
9 *
10 * %RFLAGS.ZF: !VMCS.LAUNCHED, i.e. controls VMLAUNCH vs. VMRESUME
11 *
12 * Returns:
13 * %RFLAGS.CF is set on VM-Fail Invalid
14 * %RFLAGS.ZF is set on VM-Fail Valid
15 * %RFLAGS.{CF,ZF} are cleared on VM-Success, i.e. VM-Exit
16 *
17 * Note that VMRESUME/VMLAUNCH fall-through and return directly if
18 * they VM-Fail, whereas a successful VM-Enter + VM-Exit will jump
19 * to vmx_vmexit.
20 */
21ENTRY(vmx_vmenter)
22 /* EFLAGS.ZF is set if VMCS.LAUNCHED == 0 */
23 je 2f
24
251: vmresume
26 ret
27
282: vmlaunch
29 ret
30
313: cmpb $0, kvm_rebooting
32 jne 4f
33 call kvm_spurious_fault
344: ret
35
36 .pushsection .fixup, "ax"
375: jmp 3b
38 .popsection
39
40 _ASM_EXTABLE(1b, 5b)
41 _ASM_EXTABLE(2b, 5b)
42
43ENDPROC(vmx_vmenter)
44
45/**
46 * vmx_vmexit - Handle a VMX VM-Exit
47 *
48 * Returns:
49 * %RFLAGS.{CF,ZF} are cleared on VM-Success, i.e. VM-Exit
50 *
51 * This is vmx_vmenter's partner in crime. On a VM-Exit, control will jump
52 * here after hardware loads the host's state, i.e. this is the destination
53 * referred to by VMCS.HOST_RIP.
54 */
55ENTRY(vmx_vmexit)
56 ret
57ENDPROC(vmx_vmexit)
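
A hypothetical C-level sketch of the RFLAGS convention documented above for vmx_vmenter (illustrative only; the real caller consumes CF/ZF directly in inline assembly). It assumes the X86_EFLAGS_* constants from asm/processor-flags.h; the enum and helper names are made up for illustration.

	enum example_vmenter_result {
		EXAMPLE_VMEXIT,		/* CF = ZF = 0: successful VM-Enter, then VM-Exit */
		EXAMPLE_FAIL_INVALID,	/* CF = 1: VM-Fail Invalid */
		EXAMPLE_FAIL_VALID,	/* ZF = 1: VM-Fail Valid */
	};

	static inline enum example_vmenter_result
	example_decode_vmenter_flags(unsigned long rflags)
	{
		if (rflags & X86_EFLAGS_CF)
			return EXAMPLE_FAIL_INVALID;
		if (rflags & X86_EFLAGS_ZF)
			return EXAMPLE_FAIL_VALID;
		return EXAMPLE_VMEXIT;
	}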
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
new file mode 100644
index 000000000000..41d6f7081ff7
--- /dev/null
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -0,0 +1,7935 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * Copyright (C) 2006 Qumranet, Inc.
8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9 *
10 * Authors:
11 * Avi Kivity <avi@qumranet.com>
12 * Yaniv Kamay <yaniv@qumranet.com>
13 *
14 * This work is licensed under the terms of the GNU GPL, version 2. See
15 * the COPYING file in the top-level directory.
16 *
17 */
18
19#include <linux/frame.h>
20#include <linux/highmem.h>
21#include <linux/hrtimer.h>
22#include <linux/kernel.h>
23#include <linux/kvm_host.h>
24#include <linux/module.h>
25#include <linux/moduleparam.h>
26#include <linux/mod_devicetable.h>
27#include <linux/mm.h>
28#include <linux/sched.h>
29#include <linux/slab.h>
30#include <linux/tboot.h>
31#include <linux/trace_events.h>
32
33#include <asm/apic.h>
34#include <asm/asm.h>
35#include <asm/cpu.h>
36#include <asm/debugreg.h>
37#include <asm/desc.h>
38#include <asm/fpu/internal.h>
39#include <asm/io.h>
40#include <asm/irq_remapping.h>
41#include <asm/kexec.h>
42#include <asm/perf_event.h>
43#include <asm/mce.h>
44#include <asm/mmu_context.h>
45#include <asm/mshyperv.h>
46#include <asm/spec-ctrl.h>
47#include <asm/virtext.h>
48#include <asm/vmx.h>
49
50#include "capabilities.h"
51#include "cpuid.h"
52#include "evmcs.h"
53#include "irq.h"
54#include "kvm_cache_regs.h"
55#include "lapic.h"
56#include "mmu.h"
57#include "nested.h"
58#include "ops.h"
59#include "pmu.h"
60#include "trace.h"
61#include "vmcs.h"
62#include "vmcs12.h"
63#include "vmx.h"
64#include "x86.h"
65
66MODULE_AUTHOR("Qumranet");
67MODULE_LICENSE("GPL");
68
69static const struct x86_cpu_id vmx_cpu_id[] = {
70 X86_FEATURE_MATCH(X86_FEATURE_VMX),
71 {}
72};
73MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
74
75bool __read_mostly enable_vpid = 1;
76module_param_named(vpid, enable_vpid, bool, 0444);
77
78static bool __read_mostly enable_vnmi = 1;
79module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
80
81bool __read_mostly flexpriority_enabled = 1;
82module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
83
84bool __read_mostly enable_ept = 1;
85module_param_named(ept, enable_ept, bool, S_IRUGO);
86
87bool __read_mostly enable_unrestricted_guest = 1;
88module_param_named(unrestricted_guest,
89 enable_unrestricted_guest, bool, S_IRUGO);
90
91bool __read_mostly enable_ept_ad_bits = 1;
92module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
93
94static bool __read_mostly emulate_invalid_guest_state = true;
95module_param(emulate_invalid_guest_state, bool, S_IRUGO);
96
97static bool __read_mostly fasteoi = 1;
98module_param(fasteoi, bool, S_IRUGO);
99
100static bool __read_mostly enable_apicv = 1;
101module_param(enable_apicv, bool, S_IRUGO);
102
103/*
104 * If nested=1, nested virtualization is supported, i.e., guests may use
105 * VMX and be a hypervisor for its own guests. If nested=0, guests may not
106 * use VMX instructions.
107 */
108static bool __read_mostly nested = 1;
109module_param(nested, bool, S_IRUGO);
110
111static u64 __read_mostly host_xss;
112
113bool __read_mostly enable_pml = 1;
114module_param_named(pml, enable_pml, bool, S_IRUGO);
115
116#define MSR_BITMAP_MODE_X2APIC 1
117#define MSR_BITMAP_MODE_X2APIC_APICV 2
118
119#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
120
121/* Guest_tsc -> host_tsc conversion requires 64-bit division. */
122static int __read_mostly cpu_preemption_timer_multi;
123static bool __read_mostly enable_preemption_timer = 1;
124#ifdef CONFIG_X86_64
125module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
126#endif
127
128#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
129#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
130#define KVM_VM_CR0_ALWAYS_ON \
131 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \
132 X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
133#define KVM_CR4_GUEST_OWNED_BITS \
134 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
135 | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)
136
137#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
138#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
139#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
140
141#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
142
143#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
144 RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
145 RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
146 RTIT_STATUS_BYTECNT))
147
148#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \
149 (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f)
150
151/*
 152 * These two parameters are used to configure the controls for Pause-Loop Exiting:
 153 * ple_gap:    upper bound on the amount of time between two successive
 154 *             executions of PAUSE in a loop; also indicates whether PLE is enabled.
 155 *             According to tests, this time is usually less than 128 cycles.
 156 * ple_window: upper bound on the amount of time a guest is allowed to execute
 157 *             in a PAUSE loop. Tests indicate that most spinlocks are held for
 158 *             less than 2^12 cycles.
 159 * Time is measured using a counter that runs at the same rate as the TSC;
 160 * refer to SDM volume 3B, sections 21.6.13 and 22.1.3.
161 */
162static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
163module_param(ple_gap, uint, 0444);
164
165static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
166module_param(ple_window, uint, 0444);
167
168/* Default doubles per-vcpu window every exit. */
169static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
170module_param(ple_window_grow, uint, 0444);
171
172/* Default resets per-vcpu window every exit to ple_window. */
173static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
174module_param(ple_window_shrink, uint, 0444);
175
176/* Default is to compute the maximum so we can never overflow. */
177static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
178module_param(ple_window_max, uint, 0444);
179
180/* Default is SYSTEM mode, 1 for host-guest mode */
181int __read_mostly pt_mode = PT_MODE_SYSTEM;
182module_param(pt_mode, int, S_IRUGO);
183
184static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
185static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
186static DEFINE_MUTEX(vmx_l1d_flush_mutex);
187
188/* Storage for pre module init parameter parsing */
189static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
190
191static const struct {
192 const char *option;
193 bool for_parse;
194} vmentry_l1d_param[] = {
195 [VMENTER_L1D_FLUSH_AUTO] = {"auto", true},
196 [VMENTER_L1D_FLUSH_NEVER] = {"never", true},
197 [VMENTER_L1D_FLUSH_COND] = {"cond", true},
198 [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true},
199 [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
200 [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
201};
202
203#define L1D_CACHE_ORDER 4
204static void *vmx_l1d_flush_pages;
205
206static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
207{
208 struct page *page;
209 unsigned int i;
210
211 if (!enable_ept) {
212 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
213 return 0;
214 }
215
216 if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
217 u64 msr;
218
219 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
220 if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
221 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
222 return 0;
223 }
224 }
225
226 /* If set to auto use the default l1tf mitigation method */
227 if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
228 switch (l1tf_mitigation) {
229 case L1TF_MITIGATION_OFF:
230 l1tf = VMENTER_L1D_FLUSH_NEVER;
231 break;
232 case L1TF_MITIGATION_FLUSH_NOWARN:
233 case L1TF_MITIGATION_FLUSH:
234 case L1TF_MITIGATION_FLUSH_NOSMT:
235 l1tf = VMENTER_L1D_FLUSH_COND;
236 break;
237 case L1TF_MITIGATION_FULL:
238 case L1TF_MITIGATION_FULL_FORCE:
239 l1tf = VMENTER_L1D_FLUSH_ALWAYS;
240 break;
241 }
242 } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
243 l1tf = VMENTER_L1D_FLUSH_ALWAYS;
244 }
245
246 if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
247 !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
248 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
249 if (!page)
250 return -ENOMEM;
251 vmx_l1d_flush_pages = page_address(page);
252
253 /*
254 * Initialize each page with a different pattern in
255 * order to protect against KSM in the nested
256 * virtualization case.
257 */
258 for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
259 memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
260 PAGE_SIZE);
261 }
262 }
263
264 l1tf_vmx_mitigation = l1tf;
265
266 if (l1tf != VMENTER_L1D_FLUSH_NEVER)
267 static_branch_enable(&vmx_l1d_should_flush);
268 else
269 static_branch_disable(&vmx_l1d_should_flush);
270
271 if (l1tf == VMENTER_L1D_FLUSH_COND)
272 static_branch_enable(&vmx_l1d_flush_cond);
273 else
274 static_branch_disable(&vmx_l1d_flush_cond);
275 return 0;
276}
277
278static int vmentry_l1d_flush_parse(const char *s)
279{
280 unsigned int i;
281
282 if (s) {
283 for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
284 if (vmentry_l1d_param[i].for_parse &&
285 sysfs_streq(s, vmentry_l1d_param[i].option))
286 return i;
287 }
288 }
289 return -EINVAL;
290}
291
292static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
293{
294 int l1tf, ret;
295
296 l1tf = vmentry_l1d_flush_parse(s);
297 if (l1tf < 0)
298 return l1tf;
299
300 if (!boot_cpu_has(X86_BUG_L1TF))
301 return 0;
302
303 /*
304 * Has vmx_init() run already? If not then this is the pre init
305 * parameter parsing. In that case just store the value and let
306 * vmx_init() do the proper setup after enable_ept has been
307 * established.
308 */
309 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
310 vmentry_l1d_flush_param = l1tf;
311 return 0;
312 }
313
314 mutex_lock(&vmx_l1d_flush_mutex);
315 ret = vmx_setup_l1d_flush(l1tf);
316 mutex_unlock(&vmx_l1d_flush_mutex);
317 return ret;
318}
319
320static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
321{
322 if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
323 return sprintf(s, "???\n");
324
325 return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
326}
327
328static const struct kernel_param_ops vmentry_l1d_flush_ops = {
329 .set = vmentry_l1d_flush_set,
330 .get = vmentry_l1d_flush_get,
331};
332module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
333
334static bool guest_state_valid(struct kvm_vcpu *vcpu);
335static u32 vmx_segment_access_rights(struct kvm_segment *var);
336static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
337 u32 msr, int type);
338
339void vmx_vmexit(void);
340
341static DEFINE_PER_CPU(struct vmcs *, vmxarea);
342DEFINE_PER_CPU(struct vmcs *, current_vmcs);
343/*
 344 * We maintain a per-CPU linked list of the VMCSs loaded on that CPU. This is
 345 * needed when a CPU is brought down and we need to VMCLEAR all VMCSs loaded on it.
346 */
347static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
348
349/*
 350 * We maintain a per-CPU linked list of vCPUs, so in wakeup_handler() we
 351 * can find which vCPU should be woken up.
352 */
353static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
354static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
355
356static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
357static DEFINE_SPINLOCK(vmx_vpid_lock);
358
359struct vmcs_config vmcs_config;
360struct vmx_capability vmx_capability;
361
362#define VMX_SEGMENT_FIELD(seg) \
363 [VCPU_SREG_##seg] = { \
364 .selector = GUEST_##seg##_SELECTOR, \
365 .base = GUEST_##seg##_BASE, \
366 .limit = GUEST_##seg##_LIMIT, \
367 .ar_bytes = GUEST_##seg##_AR_BYTES, \
368 }
369
370static const struct kvm_vmx_segment_field {
371 unsigned selector;
372 unsigned base;
373 unsigned limit;
374 unsigned ar_bytes;
375} kvm_vmx_segment_fields[] = {
376 VMX_SEGMENT_FIELD(CS),
377 VMX_SEGMENT_FIELD(DS),
378 VMX_SEGMENT_FIELD(ES),
379 VMX_SEGMENT_FIELD(FS),
380 VMX_SEGMENT_FIELD(GS),
381 VMX_SEGMENT_FIELD(SS),
382 VMX_SEGMENT_FIELD(TR),
383 VMX_SEGMENT_FIELD(LDTR),
384};
385
386u64 host_efer;
387
388/*
389 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
390 * will emulate SYSCALL in legacy mode if the vendor string in guest
391 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
392 * support this emulation, IA32_STAR must always be included in
393 * vmx_msr_index[], even in i386 builds.
394 */
395const u32 vmx_msr_index[] = {
396#ifdef CONFIG_X86_64
397 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
398#endif
399 MSR_EFER, MSR_TSC_AUX, MSR_STAR,
400};
401
402#if IS_ENABLED(CONFIG_HYPERV)
403static bool __read_mostly enlightened_vmcs = true;
404module_param(enlightened_vmcs, bool, 0444);
405
406/* check_ept_pointer() should be under protection of ept_pointer_lock. */
407static void check_ept_pointer_match(struct kvm *kvm)
408{
409 struct kvm_vcpu *vcpu;
410 u64 tmp_eptp = INVALID_PAGE;
411 int i;
412
413 kvm_for_each_vcpu(i, vcpu, kvm) {
414 if (!VALID_PAGE(tmp_eptp)) {
415 tmp_eptp = to_vmx(vcpu)->ept_pointer;
416 } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
417 to_kvm_vmx(kvm)->ept_pointers_match
418 = EPT_POINTERS_MISMATCH;
419 return;
420 }
421 }
422
423 to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
424}
425
426int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
427 void *data)
428{
429 struct kvm_tlb_range *range = data;
430
431 return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
432 range->pages);
433}
434
435static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm,
436 struct kvm_vcpu *vcpu, struct kvm_tlb_range *range)
437{
438 u64 ept_pointer = to_vmx(vcpu)->ept_pointer;
439
440 /*
 441 * The FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the
 442 * address of the base of the EPT PML4 table, so strip off the
 443 * EPT configuration information.
444 */
445 if (range)
446 return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK,
447 kvm_fill_hv_flush_list_func, (void *)range);
448 else
449 return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK);
450}
451
452static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
453 struct kvm_tlb_range *range)
454{
455 struct kvm_vcpu *vcpu;
456 int ret = -ENOTSUPP, i;
457
458 spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
459
460 if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
461 check_ept_pointer_match(kvm);
462
463 if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
464 kvm_for_each_vcpu(i, vcpu, kvm) {
465 /* If ept_pointer is invalid pointer, bypass flush request. */
466 if (VALID_PAGE(to_vmx(vcpu)->ept_pointer))
467 ret |= __hv_remote_flush_tlb_with_range(
468 kvm, vcpu, range);
469 }
470 } else {
471 ret = __hv_remote_flush_tlb_with_range(kvm,
472 kvm_get_vcpu(kvm, 0), range);
473 }
474
475 spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
476 return ret;
477}
478static int hv_remote_flush_tlb(struct kvm *kvm)
479{
480 return hv_remote_flush_tlb_with_range(kvm, NULL);
481}
482
483#endif /* IS_ENABLED(CONFIG_HYPERV) */
484
485/*
486 * Comment's format: document - errata name - stepping - processor name.
487 * Refer from
488 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
489 */
490static u32 vmx_preemption_cpu_tfms[] = {
491/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */
4920x000206E6,
493/* 323056.pdf - AAX65 - C2 - Xeon L3406 */
494/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
495/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
4960x00020652,
497/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
4980x00020655,
499/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */
500/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */
501/*
502 * 320767.pdf - AAP86 - B1 -
503 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
504 */
5050x000106E5,
506/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
5070x000106A0,
508/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
5090x000106A1,
510/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
5110x000106A4,
512 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
513 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
514 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
5150x000106A5,
516 /* Xeon E3-1220 V2 */
5170x000306A8,
518};
519
520static inline bool cpu_has_broken_vmx_preemption_timer(void)
521{
522 u32 eax = cpuid_eax(0x00000001), i;
523
524 /* Clear the reserved bits */
525 eax &= ~(0x3U << 14 | 0xfU << 28);
526 for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
527 if (eax == vmx_preemption_cpu_tfms[i])
528 return true;
529
530 return false;
531}
532
533static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
534{
535 return flexpriority_enabled && lapic_in_kernel(vcpu);
536}
537
538static inline bool report_flexpriority(void)
539{
540 return flexpriority_enabled;
541}
542
543static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
544{
545 int i;
546
547 for (i = 0; i < vmx->nmsrs; ++i)
548 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
549 return i;
550 return -1;
551}
552
553struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
554{
555 int i;
556
557 i = __find_msr_index(vmx, msr);
558 if (i >= 0)
559 return &vmx->guest_msrs[i];
560 return NULL;
561}
562
563void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
564{
565 vmcs_clear(loaded_vmcs->vmcs);
566 if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
567 vmcs_clear(loaded_vmcs->shadow_vmcs);
568 loaded_vmcs->cpu = -1;
569 loaded_vmcs->launched = 0;
570}
571
572#ifdef CONFIG_KEXEC_CORE
573/*
 574 * This bitmap indicates whether the vmclear operation is
 575 * enabled on each CPU. All bits are cleared (disabled) by
 576 * default.
577 */
578static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
579
580static inline void crash_enable_local_vmclear(int cpu)
581{
582 cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
583}
584
585static inline void crash_disable_local_vmclear(int cpu)
586{
587 cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
588}
589
590static inline int crash_local_vmclear_enabled(int cpu)
591{
592 return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
593}
594
595static void crash_vmclear_local_loaded_vmcss(void)
596{
597 int cpu = raw_smp_processor_id();
598 struct loaded_vmcs *v;
599
600 if (!crash_local_vmclear_enabled(cpu))
601 return;
602
603 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
604 loaded_vmcss_on_cpu_link)
605 vmcs_clear(v->vmcs);
606}
607#else
608static inline void crash_enable_local_vmclear(int cpu) { }
609static inline void crash_disable_local_vmclear(int cpu) { }
610#endif /* CONFIG_KEXEC_CORE */
611
612static void __loaded_vmcs_clear(void *arg)
613{
614 struct loaded_vmcs *loaded_vmcs = arg;
615 int cpu = raw_smp_processor_id();
616
617 if (loaded_vmcs->cpu != cpu)
618 return; /* vcpu migration can race with cpu offline */
619 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
620 per_cpu(current_vmcs, cpu) = NULL;
621 crash_disable_local_vmclear(cpu);
622 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
623
624 /*
 625 * We should ensure that updating loaded_vmcs->loaded_vmcss_on_cpu_link
 626 * happens before setting loaded_vmcs->cpu to -1, which is done in
 627 * loaded_vmcs_init. Otherwise, another CPU could see cpu == -1 first
 628 * and then add the vmcs to the per-CPU list before it is deleted here.
629 */
630 smp_wmb();
631
632 loaded_vmcs_init(loaded_vmcs);
633 crash_enable_local_vmclear(cpu);
634}
635
636void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
637{
638 int cpu = loaded_vmcs->cpu;
639
640 if (cpu != -1)
641 smp_call_function_single(cpu,
642 __loaded_vmcs_clear, loaded_vmcs, 1);
643}
644
645static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
646 unsigned field)
647{
648 bool ret;
649 u32 mask = 1 << (seg * SEG_FIELD_NR + field);
650
651 if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
652 vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
653 vmx->segment_cache.bitmask = 0;
654 }
655 ret = vmx->segment_cache.bitmask & mask;
656 vmx->segment_cache.bitmask |= mask;
657 return ret;
658}
659
660static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
661{
662 u16 *p = &vmx->segment_cache.seg[seg].selector;
663
664 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
665 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
666 return *p;
667}
668
669static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
670{
671 ulong *p = &vmx->segment_cache.seg[seg].base;
672
673 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
674 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
675 return *p;
676}
677
678static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
679{
680 u32 *p = &vmx->segment_cache.seg[seg].limit;
681
682 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
683 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
684 return *p;
685}
686
687static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
688{
689 u32 *p = &vmx->segment_cache.seg[seg].ar;
690
691 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
692 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
693 return *p;
694}
695
696void update_exception_bitmap(struct kvm_vcpu *vcpu)
697{
698 u32 eb;
699
700 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
701 (1u << DB_VECTOR) | (1u << AC_VECTOR);
702 /*
 703 * Guest access to VMware backdoor ports could legitimately
 704 * trigger #GP because of the TSS I/O permission bitmap.
 705 * We intercept those #GPs and allow access anyway,
 706 * as VMware does.
707 */
708 if (enable_vmware_backdoor)
709 eb |= (1u << GP_VECTOR);
710 if ((vcpu->guest_debug &
711 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
712 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
713 eb |= 1u << BP_VECTOR;
714 if (to_vmx(vcpu)->rmode.vm86_active)
715 eb = ~0;
716 if (enable_ept)
717 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
718
719 /* When we are running a nested L2 guest and L1 specified for it a
720 * certain exception bitmap, we must trap the same exceptions and pass
721 * them to L1. When running L2, we will only handle the exceptions
722 * specified above if L1 did not want them.
723 */
724 if (is_guest_mode(vcpu))
725 eb |= get_vmcs12(vcpu)->exception_bitmap;
726
727 vmcs_write32(EXCEPTION_BITMAP, eb);
728}
729
730/*
731 * Check if MSR is intercepted for currently loaded MSR bitmap.
732 */
733static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
734{
735 unsigned long *msr_bitmap;
736 int f = sizeof(unsigned long);
737
738 if (!cpu_has_vmx_msr_bitmap())
739 return true;
740
741 msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
742
743 if (msr <= 0x1fff) {
744 return !!test_bit(msr, msr_bitmap + 0x800 / f);
745 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
746 msr &= 0x1fff;
747 return !!test_bit(msr, msr_bitmap + 0xc00 / f);
748 }
749
750 return true;
751}
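
msr_write_intercepted() above consults only the write halves of the 4K MSR bitmap (offsets 0x800 for low MSRs and 0xc00 for high MSRs). A read-side check would mirror it using the read halves, which the SDM places at offsets 0x0 and 0x400; the sketch below is illustrative only and not code from this series.

	static bool example_msr_read_intercepted(struct kvm_vcpu *vcpu, u32 msr)
	{
		unsigned long *msr_bitmap;
		int f = sizeof(unsigned long);

		if (!cpu_has_vmx_msr_bitmap())
			return true;

		msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;

		if (msr <= 0x1fff) {
			return !!test_bit(msr, msr_bitmap + 0x000 / f);
		} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
			msr &= 0x1fff;
			return !!test_bit(msr, msr_bitmap + 0x400 / f);
		}

		return true;
	}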
752
753static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
754 unsigned long entry, unsigned long exit)
755{
756 vm_entry_controls_clearbit(vmx, entry);
757 vm_exit_controls_clearbit(vmx, exit);
758}
759
760static int find_msr(struct vmx_msrs *m, unsigned int msr)
761{
762 unsigned int i;
763
764 for (i = 0; i < m->nr; ++i) {
765 if (m->val[i].index == msr)
766 return i;
767 }
768 return -ENOENT;
769}
770
771static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
772{
773 int i;
774 struct msr_autoload *m = &vmx->msr_autoload;
775
776 switch (msr) {
777 case MSR_EFER:
778 if (cpu_has_load_ia32_efer()) {
779 clear_atomic_switch_msr_special(vmx,
780 VM_ENTRY_LOAD_IA32_EFER,
781 VM_EXIT_LOAD_IA32_EFER);
782 return;
783 }
784 break;
785 case MSR_CORE_PERF_GLOBAL_CTRL:
786 if (cpu_has_load_perf_global_ctrl()) {
787 clear_atomic_switch_msr_special(vmx,
788 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
789 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
790 return;
791 }
792 break;
793 }
794 i = find_msr(&m->guest, msr);
795 if (i < 0)
796 goto skip_guest;
797 --m->guest.nr;
798 m->guest.val[i] = m->guest.val[m->guest.nr];
799 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
800
801skip_guest:
802 i = find_msr(&m->host, msr);
803 if (i < 0)
804 return;
805
806 --m->host.nr;
807 m->host.val[i] = m->host.val[m->host.nr];
808 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
809}
810
811static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
812 unsigned long entry, unsigned long exit,
813 unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
814 u64 guest_val, u64 host_val)
815{
816 vmcs_write64(guest_val_vmcs, guest_val);
817 if (host_val_vmcs != HOST_IA32_EFER)
818 vmcs_write64(host_val_vmcs, host_val);
819 vm_entry_controls_setbit(vmx, entry);
820 vm_exit_controls_setbit(vmx, exit);
821}
822
823static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
824 u64 guest_val, u64 host_val, bool entry_only)
825{
826 int i, j = 0;
827 struct msr_autoload *m = &vmx->msr_autoload;
828
829 switch (msr) {
830 case MSR_EFER:
831 if (cpu_has_load_ia32_efer()) {
832 add_atomic_switch_msr_special(vmx,
833 VM_ENTRY_LOAD_IA32_EFER,
834 VM_EXIT_LOAD_IA32_EFER,
835 GUEST_IA32_EFER,
836 HOST_IA32_EFER,
837 guest_val, host_val);
838 return;
839 }
840 break;
841 case MSR_CORE_PERF_GLOBAL_CTRL:
842 if (cpu_has_load_perf_global_ctrl()) {
843 add_atomic_switch_msr_special(vmx,
844 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
845 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
846 GUEST_IA32_PERF_GLOBAL_CTRL,
847 HOST_IA32_PERF_GLOBAL_CTRL,
848 guest_val, host_val);
849 return;
850 }
851 break;
852 case MSR_IA32_PEBS_ENABLE:
853 /* PEBS needs a quiescent period after being disabled (to write
854 * a record). Disabling PEBS through VMX MSR swapping doesn't
 855 * provide that period, so a CPU could write the host's record into
 856 * the guest's memory.
857 */
858 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
859 }
860
861 i = find_msr(&m->guest, msr);
862 if (!entry_only)
863 j = find_msr(&m->host, msr);
864
865 if (i == NR_AUTOLOAD_MSRS || j == NR_AUTOLOAD_MSRS) {
866 printk_once(KERN_WARNING "Not enough msr switch entries. "
867 "Can't add msr %x\n", msr);
868 return;
869 }
870 if (i < 0) {
871 i = m->guest.nr++;
872 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
873 }
874 m->guest.val[i].index = msr;
875 m->guest.val[i].value = guest_val;
876
877 if (entry_only)
878 return;
879
880 if (j < 0) {
881 j = m->host.nr++;
882 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
883 }
884 m->host.val[j].index = msr;
885 m->host.val[j].value = host_val;
886}
887
888static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
889{
890 u64 guest_efer = vmx->vcpu.arch.efer;
891 u64 ignore_bits = 0;
892
893 if (!enable_ept) {
894 /*
895 * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing
896 * host CPUID is more efficient than testing guest CPUID
897 * or CR4. Host SMEP is anyway a requirement for guest SMEP.
898 */
899 if (boot_cpu_has(X86_FEATURE_SMEP))
900 guest_efer |= EFER_NX;
901 else if (!(guest_efer & EFER_NX))
902 ignore_bits |= EFER_NX;
903 }
904
905 /*
906 * LMA and LME handled by hardware; SCE meaningless outside long mode.
907 */
908 ignore_bits |= EFER_SCE;
909#ifdef CONFIG_X86_64
910 ignore_bits |= EFER_LMA | EFER_LME;
911 /* SCE is meaningful only in long mode on Intel */
912 if (guest_efer & EFER_LMA)
913 ignore_bits &= ~(u64)EFER_SCE;
914#endif
915
916 /*
917 * On EPT, we can't emulate NX, so we must switch EFER atomically.
918 * On CPUs that support "load IA32_EFER", always switch EFER
919 * atomically, since it's faster than switching it manually.
920 */
921 if (cpu_has_load_ia32_efer() ||
922 (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
923 if (!(guest_efer & EFER_LMA))
924 guest_efer &= ~EFER_LME;
925 if (guest_efer != host_efer)
926 add_atomic_switch_msr(vmx, MSR_EFER,
927 guest_efer, host_efer, false);
928 else
929 clear_atomic_switch_msr(vmx, MSR_EFER);
930 return false;
931 } else {
932 clear_atomic_switch_msr(vmx, MSR_EFER);
933
934 guest_efer &= ~ignore_bits;
935 guest_efer |= host_efer & ignore_bits;
936
937 vmx->guest_msrs[efer_offset].data = guest_efer;
938 vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
939
940 return true;
941 }
942}
943
944#ifdef CONFIG_X86_32
945/*
946 * On 32-bit kernels, VM exits still load the FS and GS bases from the
947 * VMCS rather than the segment table. KVM uses this helper to figure
948 * out the current bases to poke them into the VMCS before entry.
949 */
950static unsigned long segment_base(u16 selector)
951{
952 struct desc_struct *table;
953 unsigned long v;
954
955 if (!(selector & ~SEGMENT_RPL_MASK))
956 return 0;
957
958 table = get_current_gdt_ro();
959
960 if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
961 u16 ldt_selector = kvm_read_ldt();
962
963 if (!(ldt_selector & ~SEGMENT_RPL_MASK))
964 return 0;
965
966 table = (struct desc_struct *)segment_base(ldt_selector);
967 }
968 v = get_desc_base(&table[selector >> 3]);
969 return v;
970}
971#endif
972
973static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
974{
975 u32 i;
976
977 wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
978 wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
979 wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
980 wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
981 for (i = 0; i < addr_range; i++) {
982 wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
983 wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
984 }
985}
986
987static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
988{
989 u32 i;
990
991 rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
992 rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
993 rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
994 rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
995 for (i = 0; i < addr_range; i++) {
996 rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
997 rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
998 }
999}
1000
1001static void pt_guest_enter(struct vcpu_vmx *vmx)
1002{
1003 if (pt_mode == PT_MODE_SYSTEM)
1004 return;
1005
1006 /*
1007 * GUEST_IA32_RTIT_CTL is already set in the VMCS.
1008 * Save host state before VM entry.
1009 */
1010 rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1011 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1012 wrmsrl(MSR_IA32_RTIT_CTL, 0);
1013 pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
1014 pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
1015 }
1016}
1017
1018static void pt_guest_exit(struct vcpu_vmx *vmx)
1019{
1020 if (pt_mode == PT_MODE_SYSTEM)
1021 return;
1022
1023 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1024 pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
1025 pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
1026 }
1027
1028 /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */
1029 wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1030}
1031
1032void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1033{
1034 struct vcpu_vmx *vmx = to_vmx(vcpu);
1035 struct vmcs_host_state *host_state;
1036#ifdef CONFIG_X86_64
1037 int cpu = raw_smp_processor_id();
1038#endif
1039 unsigned long fs_base, gs_base;
1040 u16 fs_sel, gs_sel;
1041 int i;
1042
1043 vmx->req_immediate_exit = false;
1044
1045 /*
1046 * Note that guest MSRs to be saved/restored can also be changed
1047 * when guest state is loaded. This happens when guest transitions
1048 * to/from long-mode by setting MSR_EFER.LMA.
1049 */
1050 if (!vmx->loaded_cpu_state || vmx->guest_msrs_dirty) {
1051 vmx->guest_msrs_dirty = false;
1052 for (i = 0; i < vmx->save_nmsrs; ++i)
1053 kvm_set_shared_msr(vmx->guest_msrs[i].index,
1054 vmx->guest_msrs[i].data,
1055 vmx->guest_msrs[i].mask);
1056
1057 }
1058
1059 if (vmx->loaded_cpu_state)
1060 return;
1061
1062 vmx->loaded_cpu_state = vmx->loaded_vmcs;
1063 host_state = &vmx->loaded_cpu_state->host_state;
1064
1065 /*
1066 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
1067 * allow segment selectors with cpl > 0 or ti == 1.
1068 */
1069 host_state->ldt_sel = kvm_read_ldt();
1070
1071#ifdef CONFIG_X86_64
1072 savesegment(ds, host_state->ds_sel);
1073 savesegment(es, host_state->es_sel);
1074
1075 gs_base = cpu_kernelmode_gs_base(cpu);
1076 if (likely(is_64bit_mm(current->mm))) {
1077 save_fsgs_for_kvm();
1078 fs_sel = current->thread.fsindex;
1079 gs_sel = current->thread.gsindex;
1080 fs_base = current->thread.fsbase;
1081 vmx->msr_host_kernel_gs_base = current->thread.gsbase;
1082 } else {
1083 savesegment(fs, fs_sel);
1084 savesegment(gs, gs_sel);
1085 fs_base = read_msr(MSR_FS_BASE);
1086 vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
1087 }
1088
1089 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1090#else
1091 savesegment(fs, fs_sel);
1092 savesegment(gs, gs_sel);
1093 fs_base = segment_base(fs_sel);
1094 gs_base = segment_base(gs_sel);
1095#endif
1096
1097 if (unlikely(fs_sel != host_state->fs_sel)) {
1098 if (!(fs_sel & 7))
1099 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
1100 else
1101 vmcs_write16(HOST_FS_SELECTOR, 0);
1102 host_state->fs_sel = fs_sel;
1103 }
1104 if (unlikely(gs_sel != host_state->gs_sel)) {
1105 if (!(gs_sel & 7))
1106 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
1107 else
1108 vmcs_write16(HOST_GS_SELECTOR, 0);
1109 host_state->gs_sel = gs_sel;
1110 }
1111 if (unlikely(fs_base != host_state->fs_base)) {
1112 vmcs_writel(HOST_FS_BASE, fs_base);
1113 host_state->fs_base = fs_base;
1114 }
1115 if (unlikely(gs_base != host_state->gs_base)) {
1116 vmcs_writel(HOST_GS_BASE, gs_base);
1117 host_state->gs_base = gs_base;
1118 }
1119}
1120
1121static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
1122{
1123 struct vmcs_host_state *host_state;
1124
1125 if (!vmx->loaded_cpu_state)
1126 return;
1127
1128 WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs);
1129 host_state = &vmx->loaded_cpu_state->host_state;
1130
1131 ++vmx->vcpu.stat.host_state_reload;
1132 vmx->loaded_cpu_state = NULL;
1133
1134#ifdef CONFIG_X86_64
1135 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1136#endif
1137 if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
1138 kvm_load_ldt(host_state->ldt_sel);
1139#ifdef CONFIG_X86_64
1140 load_gs_index(host_state->gs_sel);
1141#else
1142 loadsegment(gs, host_state->gs_sel);
1143#endif
1144 }
1145 if (host_state->fs_sel & 7)
1146 loadsegment(fs, host_state->fs_sel);
1147#ifdef CONFIG_X86_64
1148 if (unlikely(host_state->ds_sel | host_state->es_sel)) {
1149 loadsegment(ds, host_state->ds_sel);
1150 loadsegment(es, host_state->es_sel);
1151 }
1152#endif
1153 invalidate_tss_limit();
1154#ifdef CONFIG_X86_64
1155 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
1156#endif
1157 load_fixmap_gdt(raw_smp_processor_id());
1158}
1159
1160#ifdef CONFIG_X86_64
1161static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
1162{
1163 preempt_disable();
1164 if (vmx->loaded_cpu_state)
1165 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1166 preempt_enable();
1167 return vmx->msr_guest_kernel_gs_base;
1168}
1169
1170static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
1171{
1172 preempt_disable();
1173 if (vmx->loaded_cpu_state)
1174 wrmsrl(MSR_KERNEL_GS_BASE, data);
1175 preempt_enable();
1176 vmx->msr_guest_kernel_gs_base = data;
1177}
1178#endif
1179
1180static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
1181{
1182 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
1183 struct pi_desc old, new;
1184 unsigned int dest;
1185
1186 /*
1187 * In case of hot-plug or hot-unplug, we may have to undo
1188 * vmx_vcpu_pi_put even if there is no assigned device. And we
1189 * always keep PI.NDST up to date for simplicity: it makes the
1190 * code easier, and CPU migration is not a fast path.
1191 */
1192 if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
1193 return;
1194
1195 /*
1196 * First handle the simple case where no cmpxchg is necessary; just
1197 * allow posting non-urgent interrupts.
1198 *
1199 * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
1200 * PI.NDST: pi_post_block will do it for us and the wakeup_handler
1201 * expects the VCPU to be on the blocked_vcpu_list that matches
1202 * PI.NDST.
1203 */
1204 if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
1205 vcpu->cpu == cpu) {
1206 pi_clear_sn(pi_desc);
1207 return;
1208 }
1209
1210 /* The full case. */
1211 do {
1212 old.control = new.control = pi_desc->control;
1213
1214 dest = cpu_physical_id(cpu);
1215
1216 if (x2apic_enabled())
1217 new.ndst = dest;
1218 else
1219 new.ndst = (dest << 8) & 0xFF00;
1220
1221 new.sn = 0;
1222 } while (cmpxchg64(&pi_desc->control, old.control,
1223 new.control) != old.control);
1224}
1225
1226/*
1227 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
1228 * vcpu mutex is already taken.
1229 */
1230void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1231{
1232 struct vcpu_vmx *vmx = to_vmx(vcpu);
1233 bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
1234
1235 if (!already_loaded) {
1236 loaded_vmcs_clear(vmx->loaded_vmcs);
1237 local_irq_disable();
1238 crash_disable_local_vmclear(cpu);
1239
1240 /*
1241 * Read loaded_vmcs->cpu should be before fetching
1242 * loaded_vmcs->loaded_vmcss_on_cpu_link.
1243 * See the comments in __loaded_vmcs_clear().
1244 */
1245 smp_rmb();
1246
1247 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
1248 &per_cpu(loaded_vmcss_on_cpu, cpu));
1249 crash_enable_local_vmclear(cpu);
1250 local_irq_enable();
1251 }
1252
1253 if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
1254 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
1255 vmcs_load(vmx->loaded_vmcs->vmcs);
1256 indirect_branch_prediction_barrier();
1257 }
1258
1259 if (!already_loaded) {
1260 void *gdt = get_current_gdt_ro();
1261 unsigned long sysenter_esp;
1262
1263 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1264
1265 /*
1266 * Linux uses per-cpu TSS and GDT, so set these when switching
1267 * processors. See 22.2.4.
1268 */
1269 vmcs_writel(HOST_TR_BASE,
1270 (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
1271 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
1272
1273 /*
 1274 * A VM exit resets the host TR limit to 0x67. This is okay,
 1275 * since 0x67 covers everything except the IO bitmap, and we
 1276 * have code to handle the IO bitmap being lost after a VM
 1277 * exit.
1278 */
1279 BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);
1280
1281 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
1282 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
1283
1284 vmx->loaded_vmcs->cpu = cpu;
1285 }
1286
1287 /* Setup TSC multiplier */
1288 if (kvm_has_tsc_control &&
1289 vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
1290 decache_tsc_multiplier(vmx);
1291
1292 vmx_vcpu_pi_load(vcpu, cpu);
1293 vmx->host_pkru = read_pkru();
1294 vmx->host_debugctlmsr = get_debugctlmsr();
1295}
1296
1297static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
1298{
1299 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
1300
1301 if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
1302 !irq_remapping_cap(IRQ_POSTING_CAP) ||
1303 !kvm_vcpu_apicv_active(vcpu))
1304 return;
1305
1306 /* Set SN when the vCPU is preempted */
1307 if (vcpu->preempted)
1308 pi_set_sn(pi_desc);
1309}
1310
1311void vmx_vcpu_put(struct kvm_vcpu *vcpu)
1312{
1313 vmx_vcpu_pi_put(vcpu);
1314
1315 vmx_prepare_switch_to_host(to_vmx(vcpu));
1316}
1317
1318static bool emulation_required(struct kvm_vcpu *vcpu)
1319{
1320 return emulate_invalid_guest_state && !guest_state_valid(vcpu);
1321}
1322
1323static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
1324
1325unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
1326{
1327 unsigned long rflags, save_rflags;
1328
1329 if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
1330 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
1331 rflags = vmcs_readl(GUEST_RFLAGS);
1332 if (to_vmx(vcpu)->rmode.vm86_active) {
1333 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1334 save_rflags = to_vmx(vcpu)->rmode.save_rflags;
1335 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1336 }
1337 to_vmx(vcpu)->rflags = rflags;
1338 }
1339 return to_vmx(vcpu)->rflags;
1340}
1341
1342void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1343{
1344 unsigned long old_rflags = vmx_get_rflags(vcpu);
1345
1346 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
1347 to_vmx(vcpu)->rflags = rflags;
1348 if (to_vmx(vcpu)->rmode.vm86_active) {
1349 to_vmx(vcpu)->rmode.save_rflags = rflags;
1350 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1351 }
1352 vmcs_writel(GUEST_RFLAGS, rflags);
1353
1354 if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM)
1355 to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
1356}
1357
1358u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
1359{
1360 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1361 int ret = 0;
1362
1363 if (interruptibility & GUEST_INTR_STATE_STI)
1364 ret |= KVM_X86_SHADOW_INT_STI;
1365 if (interruptibility & GUEST_INTR_STATE_MOV_SS)
1366 ret |= KVM_X86_SHADOW_INT_MOV_SS;
1367
1368 return ret;
1369}
1370
1371void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
1372{
1373 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1374 u32 interruptibility = interruptibility_old;
1375
1376 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
1377
1378 if (mask & KVM_X86_SHADOW_INT_MOV_SS)
1379 interruptibility |= GUEST_INTR_STATE_MOV_SS;
1380 else if (mask & KVM_X86_SHADOW_INT_STI)
1381 interruptibility |= GUEST_INTR_STATE_STI;
1382
1383 if ((interruptibility != interruptibility_old))
1384 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
1385}
1386
1387static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
1388{
1389 struct vcpu_vmx *vmx = to_vmx(vcpu);
1390 unsigned long value;
1391
1392 /*
1393 * Any MSR write that attempts to change bits marked reserved will
1394 * case a #GP fault.
1395 */
1396 if (data & vmx->pt_desc.ctl_bitmask)
1397 return 1;
1398
1399 /*
1400 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
1401 * result in a #GP unless the same write also clears TraceEn.
1402 */
1403 if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
1404 ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
1405 return 1;
1406
1407 /*
 1408 * A WRMSR to IA32_RTIT_CTL that sets TraceEn but clears ToPA
 1409 * and FabricEn would cause a #GP, if
1410 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
1411 */
1412 if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
1413 !(data & RTIT_CTL_FABRIC_EN) &&
1414 !intel_pt_validate_cap(vmx->pt_desc.caps,
1415 PT_CAP_single_range_output))
1416 return 1;
1417
1418 /*
 1419 * Check the MTCFreq, CycThresh and PSBFreq encodings; any MSR write
 1420 * that uses encodings marked reserved will cause a #GP fault.
1421 */
1422 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
1423 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
1424 !test_bit((data & RTIT_CTL_MTC_RANGE) >>
1425 RTIT_CTL_MTC_RANGE_OFFSET, &value))
1426 return 1;
1427 value = intel_pt_validate_cap(vmx->pt_desc.caps,
1428 PT_CAP_cycle_thresholds);
1429 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1430 !test_bit((data & RTIT_CTL_CYC_THRESH) >>
1431 RTIT_CTL_CYC_THRESH_OFFSET, &value))
1432 return 1;
1433 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
1434 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1435 !test_bit((data & RTIT_CTL_PSB_FREQ) >>
1436 RTIT_CTL_PSB_FREQ_OFFSET, &value))
1437 return 1;
1438
1439 /*
 1440 * If ADDRx_CFG is reserved, or an encoding greater than 2 is
 1441 * used, the write causes a #GP fault.
1442 */
1443 value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
1444 if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2))
1445 return 1;
1446 value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
1447 if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2))
1448 return 1;
1449 value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
1450 if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2))
1451 return 1;
1452 value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
1453 if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2))
1454 return 1;
1455
1456 return 0;
1457}
1458
1459
1460static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
1461{
1462 unsigned long rip;
1463
1464 rip = kvm_rip_read(vcpu);
1465 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1466 kvm_rip_write(vcpu, rip);
1467
1468 /* skipping an emulated instruction also counts */
1469 vmx_set_interrupt_shadow(vcpu, 0);
1470}
1471
1472static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1473{
1474 /*
1475 * Ensure that we clear the HLT state in the VMCS. We don't need to
1476 * explicitly skip the instruction because if the HLT state is set,
1477 * then the instruction is already executing and RIP has already been
1478 * advanced.
1479 */
1480 if (kvm_hlt_in_guest(vcpu->kvm) &&
1481 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1482 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1483}
1484
1485static void vmx_queue_exception(struct kvm_vcpu *vcpu)
1486{
1487 struct vcpu_vmx *vmx = to_vmx(vcpu);
1488 unsigned nr = vcpu->arch.exception.nr;
1489 bool has_error_code = vcpu->arch.exception.has_error_code;
1490 u32 error_code = vcpu->arch.exception.error_code;
1491 u32 intr_info = nr | INTR_INFO_VALID_MASK;
1492
1493 kvm_deliver_exception_payload(vcpu);
1494
1495 if (has_error_code) {
1496 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
1497 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
1498 }
1499
1500 if (vmx->rmode.vm86_active) {
1501 int inc_eip = 0;
1502 if (kvm_exception_is_soft(nr))
1503 inc_eip = vcpu->arch.event_exit_inst_len;
1504 if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
1505 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
1506 return;
1507 }
1508
1509 WARN_ON_ONCE(vmx->emulation_required);
1510
1511 if (kvm_exception_is_soft(nr)) {
1512 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
1513 vmx->vcpu.arch.event_exit_inst_len);
1514 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
1515 } else
1516 intr_info |= INTR_TYPE_HARD_EXCEPTION;
1517
1518 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1519
1520 vmx_clear_hlt(vcpu);
1521}
1522
1523static bool vmx_rdtscp_supported(void)
1524{
1525 return cpu_has_vmx_rdtscp();
1526}
1527
1528static bool vmx_invpcid_supported(void)
1529{
1530 return cpu_has_vmx_invpcid();
1531}
1532
1533/*
1534 * Swap MSR entry in host/guest MSR entry array.
1535 */
1536static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
1537{
1538 struct shared_msr_entry tmp;
1539
1540 tmp = vmx->guest_msrs[to];
1541 vmx->guest_msrs[to] = vmx->guest_msrs[from];
1542 vmx->guest_msrs[from] = tmp;
1543}
1544
1545/*
1546 * Set up the vmcs to automatically save and restore system
1547 * msrs. Don't touch the 64-bit msrs if the guest is in legacy
1548 * mode, as fiddling with msrs is very expensive.
1549 */
1550static void setup_msrs(struct vcpu_vmx *vmx)
1551{
1552 int save_nmsrs, index;
1553
1554 save_nmsrs = 0;
1555#ifdef CONFIG_X86_64
1556 /*
1557	 * The SYSCALL MSRs are only needed in long mode guests, and only
1558 * when EFER.SCE is set.
1559 */
1560 if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
1561 index = __find_msr_index(vmx, MSR_STAR);
1562 if (index >= 0)
1563 move_msr_up(vmx, index, save_nmsrs++);
1564 index = __find_msr_index(vmx, MSR_LSTAR);
1565 if (index >= 0)
1566 move_msr_up(vmx, index, save_nmsrs++);
1567 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
1568 if (index >= 0)
1569 move_msr_up(vmx, index, save_nmsrs++);
1570 }
1571#endif
1572 index = __find_msr_index(vmx, MSR_EFER);
1573 if (index >= 0 && update_transition_efer(vmx, index))
1574 move_msr_up(vmx, index, save_nmsrs++);
1575 index = __find_msr_index(vmx, MSR_TSC_AUX);
1576 if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
1577 move_msr_up(vmx, index, save_nmsrs++);
1578
1579 vmx->save_nmsrs = save_nmsrs;
1580 vmx->guest_msrs_dirty = true;
1581
1582 if (cpu_has_vmx_msr_bitmap())
1583 vmx_update_msr_bitmap(&vmx->vcpu);
1584}
1585
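/*
 * Return the TSC offset as seen by L1: while running a nested guest with
 * TSC offsetting enabled, vcpu->arch.tsc_offset already includes the
 * offset L1 set for L2, so that part has to be subtracted out again.
 */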
1586static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
1587{
1588 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1589
1590 if (is_guest_mode(vcpu) &&
1591 (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
1592 return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
1593
1594 return vcpu->arch.tsc_offset;
1595}
1596
1597static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1598{
1599 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1600 u64 g_tsc_offset = 0;
1601
1602 /*
1603 * We're here if L1 chose not to trap WRMSR to TSC. According
1604	 * to the spec, this should set L1's TSC; the offset that L1
1605 * set for L2 remains unchanged, and still needs to be added
1606 * to the newly set TSC to get L2's TSC.
1607 */
1608 if (is_guest_mode(vcpu) &&
1609 (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
1610 g_tsc_offset = vmcs12->tsc_offset;
1611
1612 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
1613 vcpu->arch.tsc_offset - g_tsc_offset,
1614 offset);
1615 vmcs_write64(TSC_OFFSET, offset + g_tsc_offset);
1616 return offset + g_tsc_offset;
1617}
1618
1619/*
1620 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
1621 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
1622 * all guests if the "nested" module option is off, and can also be disabled
1623 * for a single guest by disabling its VMX cpuid bit.
1624 */
1625bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
1626{
1627 return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
1628}
1629
1630static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
1631 uint64_t val)
1632{
1633 uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
1634
1635 return !(val & ~valid_bits);
1636}
1637
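/*
 * Report the host-supported value of a feature MSR; only the VMX
 * capability MSRs are handled here, and only when nesting is enabled.
 */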
1638static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
1639{
1640 switch (msr->index) {
1641 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
1642 if (!nested)
1643 return 1;
1644 return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
1645 default:
1646 return 1;
1647 }
1648
1649 return 0;
1650}
1651
1652/*
1653 * Reads an msr value (of 'msr_index') into 'pdata'.
1654 * Returns 0 on success, non-0 otherwise.
1655 * Assumes vcpu_load() was already called.
1656 */
1657static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1658{
1659 struct vcpu_vmx *vmx = to_vmx(vcpu);
1660 struct shared_msr_entry *msr;
1661 u32 index;
1662
1663 switch (msr_info->index) {
1664#ifdef CONFIG_X86_64
1665 case MSR_FS_BASE:
1666 msr_info->data = vmcs_readl(GUEST_FS_BASE);
1667 break;
1668 case MSR_GS_BASE:
1669 msr_info->data = vmcs_readl(GUEST_GS_BASE);
1670 break;
1671 case MSR_KERNEL_GS_BASE:
1672 msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
1673 break;
1674#endif
1675 case MSR_EFER:
1676 return kvm_get_msr_common(vcpu, msr_info);
1677 case MSR_IA32_SPEC_CTRL:
1678 if (!msr_info->host_initiated &&
1679 !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
1680 return 1;
1681
1682 msr_info->data = to_vmx(vcpu)->spec_ctrl;
1683 break;
1684 case MSR_IA32_ARCH_CAPABILITIES:
1685 if (!msr_info->host_initiated &&
1686 !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
1687 return 1;
1688 msr_info->data = to_vmx(vcpu)->arch_capabilities;
1689 break;
1690 case MSR_IA32_SYSENTER_CS:
1691 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
1692 break;
1693 case MSR_IA32_SYSENTER_EIP:
1694 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
1695 break;
1696 case MSR_IA32_SYSENTER_ESP:
1697 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
1698 break;
1699 case MSR_IA32_BNDCFGS:
1700 if (!kvm_mpx_supported() ||
1701 (!msr_info->host_initiated &&
1702 !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
1703 return 1;
1704 msr_info->data = vmcs_read64(GUEST_BNDCFGS);
1705 break;
1706 case MSR_IA32_MCG_EXT_CTL:
1707 if (!msr_info->host_initiated &&
1708 !(vmx->msr_ia32_feature_control &
1709 FEATURE_CONTROL_LMCE))
1710 return 1;
1711 msr_info->data = vcpu->arch.mcg_ext_ctl;
1712 break;
1713 case MSR_IA32_FEATURE_CONTROL:
1714 msr_info->data = vmx->msr_ia32_feature_control;
1715 break;
1716 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
1717 if (!nested_vmx_allowed(vcpu))
1718 return 1;
1719 return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
1720 &msr_info->data);
1721 case MSR_IA32_XSS:
1722 if (!vmx_xsaves_supported())
1723 return 1;
1724 msr_info->data = vcpu->arch.ia32_xss;
1725 break;
1726 case MSR_IA32_RTIT_CTL:
1727 if (pt_mode != PT_MODE_HOST_GUEST)
1728 return 1;
1729 msr_info->data = vmx->pt_desc.guest.ctl;
1730 break;
1731 case MSR_IA32_RTIT_STATUS:
1732 if (pt_mode != PT_MODE_HOST_GUEST)
1733 return 1;
1734 msr_info->data = vmx->pt_desc.guest.status;
1735 break;
1736 case MSR_IA32_RTIT_CR3_MATCH:
1737 if ((pt_mode != PT_MODE_HOST_GUEST) ||
1738 !intel_pt_validate_cap(vmx->pt_desc.caps,
1739 PT_CAP_cr3_filtering))
1740 return 1;
1741 msr_info->data = vmx->pt_desc.guest.cr3_match;
1742 break;
1743 case MSR_IA32_RTIT_OUTPUT_BASE:
1744 if ((pt_mode != PT_MODE_HOST_GUEST) ||
1745 (!intel_pt_validate_cap(vmx->pt_desc.caps,
1746 PT_CAP_topa_output) &&
1747 !intel_pt_validate_cap(vmx->pt_desc.caps,
1748 PT_CAP_single_range_output)))
1749 return 1;
1750 msr_info->data = vmx->pt_desc.guest.output_base;
1751 break;
1752 case MSR_IA32_RTIT_OUTPUT_MASK:
1753 if ((pt_mode != PT_MODE_HOST_GUEST) ||
1754 (!intel_pt_validate_cap(vmx->pt_desc.caps,
1755 PT_CAP_topa_output) &&
1756 !intel_pt_validate_cap(vmx->pt_desc.caps,
1757 PT_CAP_single_range_output)))
1758 return 1;
1759 msr_info->data = vmx->pt_desc.guest.output_mask;
1760 break;
1761 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
1762 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
1763 if ((pt_mode != PT_MODE_HOST_GUEST) ||
1764 (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
1765 PT_CAP_num_address_ranges)))
1766 return 1;
1767 if (index % 2)
1768 msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
1769 else
1770 msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
1771 break;
1772 case MSR_TSC_AUX:
1773 if (!msr_info->host_initiated &&
1774 !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
1775 return 1;
1776 /* Otherwise falls through */
1777 default:
1778 msr = find_msr_entry(vmx, msr_info->index);
1779 if (msr) {
1780 msr_info->data = msr->data;
1781 break;
1782 }
1783 return kvm_get_msr_common(vcpu, msr_info);
1784 }
1785
1786 return 0;
1787}
1788
1789/*
1790 * Writes the MSR value into the appropriate "register".
1791 * Returns 0 on success, non-0 otherwise.
1792 * Assumes vcpu_load() was already called.
1793 */
1794static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1795{
1796 struct vcpu_vmx *vmx = to_vmx(vcpu);
1797 struct shared_msr_entry *msr;
1798 int ret = 0;
1799 u32 msr_index = msr_info->index;
1800 u64 data = msr_info->data;
1801 u32 index;
1802
1803 switch (msr_index) {
1804 case MSR_EFER:
1805 ret = kvm_set_msr_common(vcpu, msr_info);
1806 break;
1807#ifdef CONFIG_X86_64
1808 case MSR_FS_BASE:
1809 vmx_segment_cache_clear(vmx);
1810 vmcs_writel(GUEST_FS_BASE, data);
1811 break;
1812 case MSR_GS_BASE:
1813 vmx_segment_cache_clear(vmx);
1814 vmcs_writel(GUEST_GS_BASE, data);
1815 break;
1816 case MSR_KERNEL_GS_BASE:
1817 vmx_write_guest_kernel_gs_base(vmx, data);
1818 break;
1819#endif
1820 case MSR_IA32_SYSENTER_CS:
1821 vmcs_write32(GUEST_SYSENTER_CS, data);
1822 break;
1823 case MSR_IA32_SYSENTER_EIP:
1824 vmcs_writel(GUEST_SYSENTER_EIP, data);
1825 break;
1826 case MSR_IA32_SYSENTER_ESP:
1827 vmcs_writel(GUEST_SYSENTER_ESP, data);
1828 break;
1829 case MSR_IA32_BNDCFGS:
1830 if (!kvm_mpx_supported() ||
1831 (!msr_info->host_initiated &&
1832 !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
1833 return 1;
1834 if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
1835 (data & MSR_IA32_BNDCFGS_RSVD))
1836 return 1;
1837 vmcs_write64(GUEST_BNDCFGS, data);
1838 break;
1839 case MSR_IA32_SPEC_CTRL:
1840 if (!msr_info->host_initiated &&
1841 !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
1842 return 1;
1843
1844 /* The STIBP bit doesn't fault even if it's not advertised */
1845 if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
1846 return 1;
1847
1848 vmx->spec_ctrl = data;
1849
1850 if (!data)
1851 break;
1852
1853 /*
1854 * For non-nested:
1855 * When it's written (to non-zero) for the first time, pass
1856 * it through.
1857 *
1858 * For nested:
1859 * The handling of the MSR bitmap for L2 guests is done in
1860 * nested_vmx_merge_msr_bitmap. We should not touch the
1861 * vmcs02.msr_bitmap here since it gets completely overwritten
1862 * in the merging. We update the vmcs01 here for L1 as well
1863 * since it will end up touching the MSR anyway now.
1864 */
1865 vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
1866 MSR_IA32_SPEC_CTRL,
1867 MSR_TYPE_RW);
1868 break;
1869 case MSR_IA32_PRED_CMD:
1870 if (!msr_info->host_initiated &&
1871 !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
1872 return 1;
1873
1874 if (data & ~PRED_CMD_IBPB)
1875 return 1;
1876
1877 if (!data)
1878 break;
1879
1880 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
1881
1882 /*
1883 * For non-nested:
1884 * When it's written (to non-zero) for the first time, pass
1885 * it through.
1886 *
1887 * For nested:
1888 * The handling of the MSR bitmap for L2 guests is done in
1889 * nested_vmx_merge_msr_bitmap. We should not touch the
1890 * vmcs02.msr_bitmap here since it gets completely overwritten
1891 * in the merging.
1892 */
1893 vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
1894 MSR_TYPE_W);
1895 break;
1896 case MSR_IA32_ARCH_CAPABILITIES:
1897 if (!msr_info->host_initiated)
1898 return 1;
1899 vmx->arch_capabilities = data;
1900 break;
1901 case MSR_IA32_CR_PAT:
1902 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
1903 if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
1904 return 1;
1905 vmcs_write64(GUEST_IA32_PAT, data);
1906 vcpu->arch.pat = data;
1907 break;
1908 }
1909 ret = kvm_set_msr_common(vcpu, msr_info);
1910 break;
1911 case MSR_IA32_TSC_ADJUST:
1912 ret = kvm_set_msr_common(vcpu, msr_info);
1913 break;
1914 case MSR_IA32_MCG_EXT_CTL:
1915 if ((!msr_info->host_initiated &&
1916 !(to_vmx(vcpu)->msr_ia32_feature_control &
1917 FEATURE_CONTROL_LMCE)) ||
1918 (data & ~MCG_EXT_CTL_LMCE_EN))
1919 return 1;
1920 vcpu->arch.mcg_ext_ctl = data;
1921 break;
1922 case MSR_IA32_FEATURE_CONTROL:
1923 if (!vmx_feature_control_msr_valid(vcpu, data) ||
1924 (to_vmx(vcpu)->msr_ia32_feature_control &
1925 FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
1926 return 1;
1927 vmx->msr_ia32_feature_control = data;
1928 if (msr_info->host_initiated && data == 0)
1929 vmx_leave_nested(vcpu);
1930 break;
1931 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
1932 if (!msr_info->host_initiated)
1933 return 1; /* they are read-only */
1934 if (!nested_vmx_allowed(vcpu))
1935 return 1;
1936 return vmx_set_vmx_msr(vcpu, msr_index, data);
1937 case MSR_IA32_XSS:
1938 if (!vmx_xsaves_supported())
1939 return 1;
1940 /*
1941		 * The only supported bit as of Skylake is bit 8, but
1942		 * it is not supported by KVM.
1943 */
1944 if (data != 0)
1945 return 1;
1946 vcpu->arch.ia32_xss = data;
1947 if (vcpu->arch.ia32_xss != host_xss)
1948 add_atomic_switch_msr(vmx, MSR_IA32_XSS,
1949 vcpu->arch.ia32_xss, host_xss, false);
1950 else
1951 clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
1952 break;
1953 case MSR_IA32_RTIT_CTL:
1954 if ((pt_mode != PT_MODE_HOST_GUEST) ||
1955 vmx_rtit_ctl_check(vcpu, data) ||
1956 vmx->nested.vmxon)
1957 return 1;
1958 vmcs_write64(GUEST_IA32_RTIT_CTL, data);
1959 vmx->pt_desc.guest.ctl = data;
1960 pt_update_intercept_for_msr(vmx);
1961 break;
1962 case MSR_IA32_RTIT_STATUS:
1963 if ((pt_mode != PT_MODE_HOST_GUEST) ||
1964 (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
1965 (data & MSR_IA32_RTIT_STATUS_MASK))
1966 return 1;
1967 vmx->pt_desc.guest.status = data;
1968 break;
1969 case MSR_IA32_RTIT_CR3_MATCH:
1970 if ((pt_mode != PT_MODE_HOST_GUEST) ||
1971 (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
1972 !intel_pt_validate_cap(vmx->pt_desc.caps,
1973 PT_CAP_cr3_filtering))
1974 return 1;
1975 vmx->pt_desc.guest.cr3_match = data;
1976 break;
1977 case MSR_IA32_RTIT_OUTPUT_BASE:
1978 if ((pt_mode != PT_MODE_HOST_GUEST) ||
1979 (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
1980 (!intel_pt_validate_cap(vmx->pt_desc.caps,
1981 PT_CAP_topa_output) &&
1982 !intel_pt_validate_cap(vmx->pt_desc.caps,
1983 PT_CAP_single_range_output)) ||
1984 (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK))
1985 return 1;
1986 vmx->pt_desc.guest.output_base = data;
1987 break;
1988 case MSR_IA32_RTIT_OUTPUT_MASK:
1989 if ((pt_mode != PT_MODE_HOST_GUEST) ||
1990 (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
1991 (!intel_pt_validate_cap(vmx->pt_desc.caps,
1992 PT_CAP_topa_output) &&
1993 !intel_pt_validate_cap(vmx->pt_desc.caps,
1994 PT_CAP_single_range_output)))
1995 return 1;
1996 vmx->pt_desc.guest.output_mask = data;
1997 break;
1998 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
1999 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2000 if ((pt_mode != PT_MODE_HOST_GUEST) ||
2001 (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
2002 (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
2003 PT_CAP_num_address_ranges)))
2004 return 1;
2005 if (index % 2)
2006 vmx->pt_desc.guest.addr_b[index / 2] = data;
2007 else
2008 vmx->pt_desc.guest.addr_a[index / 2] = data;
2009 break;
2010 case MSR_TSC_AUX:
2011 if (!msr_info->host_initiated &&
2012 !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
2013 return 1;
2014		/* Check reserved bits: the upper 32 bits must be zero */
2015 if ((data >> 32) != 0)
2016 return 1;
2017 /* Otherwise falls through */
2018 default:
2019 msr = find_msr_entry(vmx, msr_index);
2020 if (msr) {
2021 u64 old_msr_data = msr->data;
2022 msr->data = data;
2023 if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
2024 preempt_disable();
2025 ret = kvm_set_shared_msr(msr->index, msr->data,
2026 msr->mask);
2027 preempt_enable();
2028 if (ret)
2029 msr->data = old_msr_data;
2030 }
2031 break;
2032 }
2033 ret = kvm_set_msr_common(vcpu, msr_info);
2034 }
2035
2036 return ret;
2037}
2038
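/*
 * Lazily read a guest register from the VMCS into the register cache;
 * only RSP, RIP and the PDPTRs live in the VMCS rather than in
 * vcpu->arch.regs.
 */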
2039static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
2040{
2041 __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
2042 switch (reg) {
2043 case VCPU_REGS_RSP:
2044 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
2045 break;
2046 case VCPU_REGS_RIP:
2047 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
2048 break;
2049 case VCPU_EXREG_PDPTR:
2050 if (enable_ept)
2051 ept_save_pdptrs(vcpu);
2052 break;
2053 default:
2054 break;
2055 }
2056}
2057
2058static __init int cpu_has_kvm_support(void)
2059{
2060 return cpu_has_vmx();
2061}
2062
2063static __init int vmx_disabled_by_bios(void)
2064{
2065 u64 msr;
2066
2067 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
2068 if (msr & FEATURE_CONTROL_LOCKED) {
2069 /* launched w/ TXT and VMX disabled */
2070 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
2071 && tboot_enabled())
2072 return 1;
2073 /* launched w/o TXT and VMX only enabled w/ TXT */
2074 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
2075 && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
2076 && !tboot_enabled()) {
2077 printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
2078 "activate TXT before enabling KVM\n");
2079 return 1;
2080 }
2081 /* launched w/o TXT and VMX disabled */
2082 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
2083 && !tboot_enabled())
2084 return 1;
2085 }
2086
2087 return 0;
2088}
2089
2090static void kvm_cpu_vmxon(u64 addr)
2091{
2092 cr4_set_bits(X86_CR4_VMXE);
2093 intel_pt_handle_vmx(1);
2094
2095 asm volatile ("vmxon %0" : : "m"(addr));
2096}
2097
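/*
 * Per-CPU enabling of VMX: initialize the per-cpu loaded-VMCS lists,
 * make sure IA32_FEATURE_CONTROL enables and locks VMXON, then execute
 * VMXON on this CPU's VMXON region.
 */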
2098static int hardware_enable(void)
2099{
2100 int cpu = raw_smp_processor_id();
2101 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
2102 u64 old, test_bits;
2103
2104 if (cr4_read_shadow() & X86_CR4_VMXE)
2105 return -EBUSY;
2106
2107 /*
2108 * This can happen if we hot-added a CPU but failed to allocate
2109 * VP assist page for it.
2110 */
2111 if (static_branch_unlikely(&enable_evmcs) &&
2112 !hv_get_vp_assist_page(cpu))
2113 return -EFAULT;
2114
2115 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
2116 INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
2117 spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
2118
2119 /*
2120 * Now we can enable the vmclear operation in kdump
2121 * since the loaded_vmcss_on_cpu list on this cpu
2122 * has been initialized.
2123 *
2124	 * Though the cpu is not in VMX operation yet, it is safe to
2125	 * enable the vmclear operation here because the
2126	 * loaded_vmcss_on_cpu list is still empty.
2127 */
2128 crash_enable_local_vmclear(cpu);
2129
2130 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
2131
2132 test_bits = FEATURE_CONTROL_LOCKED;
2133 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
2134 if (tboot_enabled())
2135 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
2136
2137 if ((old & test_bits) != test_bits) {
2138 /* enable and lock */
2139 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
2140 }
2141 kvm_cpu_vmxon(phys_addr);
2142 if (enable_ept)
2143 ept_sync_global();
2144
2145 return 0;
2146}
2147
2148static void vmclear_local_loaded_vmcss(void)
2149{
2150 int cpu = raw_smp_processor_id();
2151 struct loaded_vmcs *v, *n;
2152
2153 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
2154 loaded_vmcss_on_cpu_link)
2155 __loaded_vmcs_clear(v);
2156}
2157
2158
2159/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
2160 * tricks.
2161 */
2162static void kvm_cpu_vmxoff(void)
2163{
2164 asm volatile (__ex("vmxoff"));
2165
2166 intel_pt_handle_vmx(0);
2167 cr4_clear_bits(X86_CR4_VMXE);
2168}
2169
2170static void hardware_disable(void)
2171{
2172 vmclear_local_loaded_vmcss();
2173 kvm_cpu_vmxoff();
2174}
2175
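/*
 * Compute a usable control value from the corresponding capability MSR:
 * optional bits in ctl_opt are dropped if the allowed-1 mask forbids
 * them, must-be-1 bits from the low word are forced on, and any required
 * bit in ctl_min that cannot be set is an error.  In effect:
 * result = ((ctl_min | ctl_opt) & allowed_1) | must_be_1.
 */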
2176static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
2177 u32 msr, u32 *result)
2178{
2179 u32 vmx_msr_low, vmx_msr_high;
2180 u32 ctl = ctl_min | ctl_opt;
2181
2182 rdmsr(msr, vmx_msr_low, vmx_msr_high);
2183
2184 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
2185 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
2186
2187 /* Ensure minimum (required) set of control bits are supported. */
2188 if (ctl_min & ~ctl)
2189 return -EIO;
2190
2191 *result = ctl;
2192 return 0;
2193}
2194
2195static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
2196 struct vmx_capability *vmx_cap)
2197{
2198 u32 vmx_msr_low, vmx_msr_high;
2199 u32 min, opt, min2, opt2;
2200 u32 _pin_based_exec_control = 0;
2201 u32 _cpu_based_exec_control = 0;
2202 u32 _cpu_based_2nd_exec_control = 0;
2203 u32 _vmexit_control = 0;
2204 u32 _vmentry_control = 0;
2205
2206 memset(vmcs_conf, 0, sizeof(*vmcs_conf));
2207 min = CPU_BASED_HLT_EXITING |
2208#ifdef CONFIG_X86_64
2209 CPU_BASED_CR8_LOAD_EXITING |
2210 CPU_BASED_CR8_STORE_EXITING |
2211#endif
2212 CPU_BASED_CR3_LOAD_EXITING |
2213 CPU_BASED_CR3_STORE_EXITING |
2214 CPU_BASED_UNCOND_IO_EXITING |
2215 CPU_BASED_MOV_DR_EXITING |
2216 CPU_BASED_USE_TSC_OFFSETING |
2217 CPU_BASED_MWAIT_EXITING |
2218 CPU_BASED_MONITOR_EXITING |
2219 CPU_BASED_INVLPG_EXITING |
2220 CPU_BASED_RDPMC_EXITING;
2221
2222 opt = CPU_BASED_TPR_SHADOW |
2223 CPU_BASED_USE_MSR_BITMAPS |
2224 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2225 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
2226 &_cpu_based_exec_control) < 0)
2227 return -EIO;
2228#ifdef CONFIG_X86_64
2229 if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2230 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
2231 ~CPU_BASED_CR8_STORE_EXITING;
2232#endif
2233 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
2234 min2 = 0;
2235 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2236 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2237 SECONDARY_EXEC_WBINVD_EXITING |
2238 SECONDARY_EXEC_ENABLE_VPID |
2239 SECONDARY_EXEC_ENABLE_EPT |
2240 SECONDARY_EXEC_UNRESTRICTED_GUEST |
2241 SECONDARY_EXEC_PAUSE_LOOP_EXITING |
2242 SECONDARY_EXEC_DESC |
2243 SECONDARY_EXEC_RDTSCP |
2244 SECONDARY_EXEC_ENABLE_INVPCID |
2245 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2246 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2247 SECONDARY_EXEC_SHADOW_VMCS |
2248 SECONDARY_EXEC_XSAVES |
2249 SECONDARY_EXEC_RDSEED_EXITING |
2250 SECONDARY_EXEC_RDRAND_EXITING |
2251 SECONDARY_EXEC_ENABLE_PML |
2252 SECONDARY_EXEC_TSC_SCALING |
2253 SECONDARY_EXEC_PT_USE_GPA |
2254 SECONDARY_EXEC_PT_CONCEAL_VMX |
2255 SECONDARY_EXEC_ENABLE_VMFUNC |
2256 SECONDARY_EXEC_ENCLS_EXITING;
2257 if (adjust_vmx_controls(min2, opt2,
2258 MSR_IA32_VMX_PROCBASED_CTLS2,
2259 &_cpu_based_2nd_exec_control) < 0)
2260 return -EIO;
2261 }
2262#ifndef CONFIG_X86_64
2263 if (!(_cpu_based_2nd_exec_control &
2264 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2265 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
2266#endif
2267
2268 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2269 _cpu_based_2nd_exec_control &= ~(
2270 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2271 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2272 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
2273
2274 rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
2275 &vmx_cap->ept, &vmx_cap->vpid);
2276
2277 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
2278		/* CR3 accesses and invlpg don't need to cause VM-exits when
2279		   EPT is enabled */
2280 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
2281 CPU_BASED_CR3_STORE_EXITING |
2282 CPU_BASED_INVLPG_EXITING);
2283 } else if (vmx_cap->ept) {
2284 vmx_cap->ept = 0;
2285		pr_warn_once("EPT capabilities should not be reported when the "
2286			"'enable EPT' VM-execution control is not supported\n");
2287 }
2288 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
2289 vmx_cap->vpid) {
2290 vmx_cap->vpid = 0;
2291		pr_warn_once("VPID capabilities should not be reported when the "
2292			"'enable VPID' VM-execution control is not supported\n");
2293 }
2294
2295 min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
2296#ifdef CONFIG_X86_64
2297 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
2298#endif
2299 opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
2300 VM_EXIT_SAVE_IA32_PAT |
2301 VM_EXIT_LOAD_IA32_PAT |
2302 VM_EXIT_LOAD_IA32_EFER |
2303 VM_EXIT_CLEAR_BNDCFGS |
2304 VM_EXIT_PT_CONCEAL_PIP |
2305 VM_EXIT_CLEAR_IA32_RTIT_CTL;
2306 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
2307 &_vmexit_control) < 0)
2308 return -EIO;
2309
2310 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
2311 opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
2312 PIN_BASED_VMX_PREEMPTION_TIMER;
2313 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
2314 &_pin_based_exec_control) < 0)
2315 return -EIO;
2316
2317 if (cpu_has_broken_vmx_preemption_timer())
2318 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
2319 if (!(_cpu_based_2nd_exec_control &
2320 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
2321 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
2322
2323 min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
2324 opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
2325 VM_ENTRY_LOAD_IA32_PAT |
2326 VM_ENTRY_LOAD_IA32_EFER |
2327 VM_ENTRY_LOAD_BNDCFGS |
2328 VM_ENTRY_PT_CONCEAL_PIP |
2329 VM_ENTRY_LOAD_IA32_RTIT_CTL;
2330 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
2331 &_vmentry_control) < 0)
2332 return -EIO;
2333
2334 /*
2335 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
2336	 * can't be used due to an erratum where VM-exit may incorrectly clear
2337	 * IA32_PERF_GLOBAL_CTRL[34:32]. Work around the erratum by using the
2338 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
2339 */
2340 if (boot_cpu_data.x86 == 0x6) {
2341 switch (boot_cpu_data.x86_model) {
2342 case 26: /* AAK155 */
2343 case 30: /* AAP115 */
2344 case 37: /* AAT100 */
2345 case 44: /* BC86,AAY89,BD102 */
2346 case 46: /* BA97 */
2347			_vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
2348 _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
2349 pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
2350 "does not work properly. Using workaround\n");
2351 break;
2352 default:
2353 break;
2354 }
2355 }
2356
2357
2358 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
2359
2360 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
2361 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
2362 return -EIO;
2363
2364#ifdef CONFIG_X86_64
2365 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
2366 if (vmx_msr_high & (1u<<16))
2367 return -EIO;
2368#endif
2369
2370 /* Require Write-Back (WB) memory type for VMCS accesses. */
2371 if (((vmx_msr_high >> 18) & 15) != 6)
2372 return -EIO;
2373
2374 vmcs_conf->size = vmx_msr_high & 0x1fff;
2375 vmcs_conf->order = get_order(vmcs_conf->size);
2376 vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
2377
2378 vmcs_conf->revision_id = vmx_msr_low;
2379
2380 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
2381 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
2382 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
2383 vmcs_conf->vmexit_ctrl = _vmexit_control;
2384 vmcs_conf->vmentry_ctrl = _vmentry_control;
2385
2386 if (static_branch_unlikely(&enable_evmcs))
2387 evmcs_sanitize_exec_ctrls(vmcs_conf);
2388
2389 return 0;
2390}
2391
2392struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu)
2393{
2394 int node = cpu_to_node(cpu);
2395 struct page *pages;
2396 struct vmcs *vmcs;
2397
2398 pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
2399 if (!pages)
2400 return NULL;
2401 vmcs = page_address(pages);
2402 memset(vmcs, 0, vmcs_config.size);
2403
2404 /* KVM supports Enlightened VMCS v1 only */
2405 if (static_branch_unlikely(&enable_evmcs))
2406 vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
2407 else
2408 vmcs->hdr.revision_id = vmcs_config.revision_id;
2409
2410 if (shadow)
2411 vmcs->hdr.shadow_vmcs = 1;
2412 return vmcs;
2413}
2414
2415void free_vmcs(struct vmcs *vmcs)
2416{
2417 free_pages((unsigned long)vmcs, vmcs_config.order);
2418}
2419
2420/*
2421 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
2422 */
2423void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2424{
2425 if (!loaded_vmcs->vmcs)
2426 return;
2427 loaded_vmcs_clear(loaded_vmcs);
2428 free_vmcs(loaded_vmcs->vmcs);
2429 loaded_vmcs->vmcs = NULL;
2430 if (loaded_vmcs->msr_bitmap)
2431 free_page((unsigned long)loaded_vmcs->msr_bitmap);
2432 WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
2433}
2434
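/*
 * Allocate and initialize a loaded_vmcs: the VMCS itself, an all-ones
 * MSR bitmap (intercept every MSR by default) when MSR bitmaps are
 * supported, and a zeroed host-state cache.
 */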
2435int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2436{
2437 loaded_vmcs->vmcs = alloc_vmcs(false);
2438 if (!loaded_vmcs->vmcs)
2439 return -ENOMEM;
2440
2441 loaded_vmcs->shadow_vmcs = NULL;
2442 loaded_vmcs_init(loaded_vmcs);
2443
2444 if (cpu_has_vmx_msr_bitmap()) {
2445 loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
2446 if (!loaded_vmcs->msr_bitmap)
2447 goto out_vmcs;
2448 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
2449
2450 if (IS_ENABLED(CONFIG_HYPERV) &&
2451 static_branch_unlikely(&enable_evmcs) &&
2452 (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
2453 struct hv_enlightened_vmcs *evmcs =
2454 (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
2455
2456 evmcs->hv_enlightenments_control.msr_bitmap = 1;
2457 }
2458 }
2459
2460 memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
2461
2462 return 0;
2463
2464out_vmcs:
2465 free_loaded_vmcs(loaded_vmcs);
2466 return -ENOMEM;
2467}
2468
2469static void free_kvm_area(void)
2470{
2471 int cpu;
2472
2473 for_each_possible_cpu(cpu) {
2474 free_vmcs(per_cpu(vmxarea, cpu));
2475 per_cpu(vmxarea, cpu) = NULL;
2476 }
2477}
2478
2479static __init int alloc_kvm_area(void)
2480{
2481 int cpu;
2482
2483 for_each_possible_cpu(cpu) {
2484 struct vmcs *vmcs;
2485
2486 vmcs = alloc_vmcs_cpu(false, cpu);
2487 if (!vmcs) {
2488 free_kvm_area();
2489 return -ENOMEM;
2490 }
2491
2492 /*
2493 * When eVMCS is enabled, alloc_vmcs_cpu() sets
2494 * vmcs->revision_id to KVM_EVMCS_VERSION instead of
2495 * revision_id reported by MSR_IA32_VMX_BASIC.
2496 *
2497		 * However, even though it is not explicitly documented by
2498		 * the TLFS, the VMXON region passed as the VMXON argument
2499		 * should still be marked with the revision_id reported by
2500		 * the physical CPU.
2501 */
2502 if (static_branch_unlikely(&enable_evmcs))
2503 vmcs->hdr.revision_id = vmcs_config.revision_id;
2504
2505 per_cpu(vmxarea, cpu) = vmcs;
2506 }
2507 return 0;
2508}
2509
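/*
 * Restore a segment register when returning to protected mode; unless
 * invalid guest state is being emulated, force a sane RPL/DPL since the
 * real-mode values may not satisfy the VMX guest-state checks.
 */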
2510static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
2511 struct kvm_segment *save)
2512{
2513 if (!emulate_invalid_guest_state) {
2514 /*
2515 * CS and SS RPL should be equal during guest entry according
2516 * to VMX spec, but in reality it is not always so. Since vcpu
2517 * is in the middle of the transition from real mode to
2518 * protected mode it is safe to assume that RPL 0 is a good
2519 * default value.
2520 */
2521 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
2522 save->selector &= ~SEGMENT_RPL_MASK;
2523 save->dpl = save->selector & SEGMENT_RPL_MASK;
2524 save->s = 1;
2525 }
2526 vmx_set_segment(vcpu, save, seg);
2527}
2528
2529static void enter_pmode(struct kvm_vcpu *vcpu)
2530{
2531 unsigned long flags;
2532 struct vcpu_vmx *vmx = to_vmx(vcpu);
2533
2534 /*
2535	 * Update the real mode segment cache. It may not be up to date if a
2536	 * segment register was written while the vcpu was in guest mode.
2537 */
2538 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
2539 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
2540 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
2541 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
2542 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
2543 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
2544
2545 vmx->rmode.vm86_active = 0;
2546
2547 vmx_segment_cache_clear(vmx);
2548
2549 vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
2550
2551 flags = vmcs_readl(GUEST_RFLAGS);
2552 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
2553 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
2554 vmcs_writel(GUEST_RFLAGS, flags);
2555
2556 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
2557 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
2558
2559 update_exception_bitmap(vcpu);
2560
2561 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
2562 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
2563 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
2564 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
2565 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
2566 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
2567}
2568
2569static void fix_rmode_seg(int seg, struct kvm_segment *save)
2570{
2571 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2572 struct kvm_segment var = *save;
2573
2574 var.dpl = 0x3;
2575 if (seg == VCPU_SREG_CS)
2576 var.type = 0x3;
2577
2578 if (!emulate_invalid_guest_state) {
2579 var.selector = var.base >> 4;
2580 var.base = var.base & 0xffff0;
2581 var.limit = 0xffff;
2582 var.g = 0;
2583 var.db = 0;
2584 var.present = 1;
2585 var.s = 1;
2586 var.l = 0;
2587 var.unusable = 0;
2588 var.type = 0x3;
2589 var.avl = 0;
2590 if (save->base & 0xf)
2591 printk_once(KERN_WARNING "kvm: segment base is not "
2592 "paragraph aligned when entering "
2593 "protected mode (seg=%d)", seg);
2594 }
2595
2596 vmcs_write16(sf->selector, var.selector);
2597 vmcs_writel(sf->base, var.base);
2598 vmcs_write32(sf->limit, var.limit);
2599 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
2600}
2601
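/*
 * Switch the vCPU into emulated real mode (vm86): stash the current
 * segment state and RFLAGS, point TR at the real-mode TSS and rewrite
 * the segment registers so the guest runs with vm86 semantics.
 */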
2602static void enter_rmode(struct kvm_vcpu *vcpu)
2603{
2604 unsigned long flags;
2605 struct vcpu_vmx *vmx = to_vmx(vcpu);
2606 struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
2607
2608 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
2609 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
2610 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
2611 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
2612 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
2613 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
2614 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
2615
2616 vmx->rmode.vm86_active = 1;
2617
2618 /*
2619 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
2620 * vcpu. Warn the user that an update is overdue.
2621 */
2622 if (!kvm_vmx->tss_addr)
2623		printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR needs to be "
2624 "called before entering vcpu\n");
2625
2626 vmx_segment_cache_clear(vmx);
2627
2628 vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
2629 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
2630 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
2631
2632 flags = vmcs_readl(GUEST_RFLAGS);
2633 vmx->rmode.save_rflags = flags;
2634
2635 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
2636
2637 vmcs_writel(GUEST_RFLAGS, flags);
2638 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
2639 update_exception_bitmap(vcpu);
2640
2641 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
2642 fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
2643 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
2644 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
2645 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
2646 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
2647
2648 kvm_mmu_reset_context(vcpu);
2649}
2650
2651void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
2652{
2653 struct vcpu_vmx *vmx = to_vmx(vcpu);
2654 struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
2655
2656 if (!msr)
2657 return;
2658
2659 vcpu->arch.efer = efer;
2660 if (efer & EFER_LMA) {
2661 vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
2662 msr->data = efer;
2663 } else {
2664 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
2665
2666 msr->data = efer & ~EFER_LME;
2667 }
2668 setup_msrs(vmx);
2669}
2670
2671#ifdef CONFIG_X86_64
2672
2673static void enter_lmode(struct kvm_vcpu *vcpu)
2674{
2675 u32 guest_tr_ar;
2676
2677 vmx_segment_cache_clear(to_vmx(vcpu));
2678
2679 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
2680 if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
2681 pr_debug_ratelimited("%s: tss fixup for long mode. \n",
2682 __func__);
2683 vmcs_write32(GUEST_TR_AR_BYTES,
2684 (guest_tr_ar & ~VMX_AR_TYPE_MASK)
2685 | VMX_AR_TYPE_BUSY_64_TSS);
2686 }
2687 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
2688}
2689
2690static void exit_lmode(struct kvm_vcpu *vcpu)
2691{
2692 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
2693 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
2694}
2695
2696#endif
2697
2698static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
2699{
2700 int vpid = to_vmx(vcpu)->vpid;
2701
2702 if (!vpid_sync_vcpu_addr(vpid, addr))
2703 vpid_sync_context(vpid);
2704
2705 /*
2706 * If VPIDs are not supported or enabled, then the above is a no-op.
2707 * But we don't really need a TLB flush in that case anyway, because
2708 * each VM entry/exit includes an implicit flush when VPID is 0.
2709 */
2710}
2711
2712static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
2713{
2714 ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
2715
2716 vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
2717 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
2718}
2719
2720static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
2721{
2722 if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
2723 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
2724 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
2725}
2726
2727static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
2728{
2729 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
2730
2731 vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
2732 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
2733}
2734
2735static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
2736{
2737 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
2738
2739 if (!test_bit(VCPU_EXREG_PDPTR,
2740 (unsigned long *)&vcpu->arch.regs_dirty))
2741 return;
2742
2743 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
2744 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
2745 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
2746 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
2747 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
2748 }
2749}
2750
2751void ept_save_pdptrs(struct kvm_vcpu *vcpu)
2752{
2753 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
2754
2755 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
2756 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
2757 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
2758 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
2759 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
2760 }
2761
2762 __set_bit(VCPU_EXREG_PDPTR,
2763 (unsigned long *)&vcpu->arch.regs_avail);
2764 __set_bit(VCPU_EXREG_PDPTR,
2765 (unsigned long *)&vcpu->arch.regs_dirty);
2766}
2767
2768static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
2769 unsigned long cr0,
2770 struct kvm_vcpu *vcpu)
2771{
2772 if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
2773 vmx_decache_cr3(vcpu);
2774 if (!(cr0 & X86_CR0_PG)) {
2775 /* From paging/starting to nonpaging */
2776 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2777 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
2778 (CPU_BASED_CR3_LOAD_EXITING |
2779 CPU_BASED_CR3_STORE_EXITING));
2780 vcpu->arch.cr0 = cr0;
2781 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
2782 } else if (!is_paging(vcpu)) {
2783 /* From nonpaging to paging */
2784 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
2785 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
2786 ~(CPU_BASED_CR3_LOAD_EXITING |
2787 CPU_BASED_CR3_STORE_EXITING));
2788 vcpu->arch.cr0 = cr0;
2789 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
2790 }
2791
2792 if (!(cr0 & X86_CR0_WP))
2793 *hw_cr0 &= ~X86_CR0_WP;
2794}
2795
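/*
 * Update guest CR0: apply the bits VMX requires to always be set or
 * cleared in hardware, handle real-mode/protected-mode and long-mode
 * transitions, adjust the EPT paging mode when unrestricted guest is
 * not available, and recompute whether emulation is required.
 */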
2796void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
2797{
2798 struct vcpu_vmx *vmx = to_vmx(vcpu);
2799 unsigned long hw_cr0;
2800
2801 hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
2802 if (enable_unrestricted_guest)
2803 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
2804 else {
2805 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
2806
2807 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
2808 enter_pmode(vcpu);
2809
2810 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
2811 enter_rmode(vcpu);
2812 }
2813
2814#ifdef CONFIG_X86_64
2815 if (vcpu->arch.efer & EFER_LME) {
2816 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
2817 enter_lmode(vcpu);
2818 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
2819 exit_lmode(vcpu);
2820 }
2821#endif
2822
2823 if (enable_ept && !enable_unrestricted_guest)
2824 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
2825
2826 vmcs_writel(CR0_READ_SHADOW, cr0);
2827 vmcs_writel(GUEST_CR0, hw_cr0);
2828 vcpu->arch.cr0 = cr0;
2829
2830 /* depends on vcpu->arch.cr0 to be set to a new value */
2831 vmx->emulation_required = emulation_required(vcpu);
2832}
2833
2834static int get_ept_level(struct kvm_vcpu *vcpu)
2835{
2836 if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
2837 return 5;
2838 return 4;
2839}
2840
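/*
 * Build the EPT pointer for the given root: write-back memory type,
 * 4- or 5-level page walk, optional accessed/dirty bits, plus the
 * physical address of the root table.
 */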
2841u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
2842{
2843 u64 eptp = VMX_EPTP_MT_WB;
2844
2845 eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
2846
2847 if (enable_ept_ad_bits &&
2848 (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
2849 eptp |= VMX_EPTP_AD_ENABLE_BIT;
2850 eptp |= (root_hpa & PAGE_MASK);
2851
2852 return eptp;
2853}
2854
2855void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
2856{
2857 struct kvm *kvm = vcpu->kvm;
2858 unsigned long guest_cr3;
2859 u64 eptp;
2860
2861 guest_cr3 = cr3;
2862 if (enable_ept) {
2863 eptp = construct_eptp(vcpu, cr3);
2864 vmcs_write64(EPT_POINTER, eptp);
2865
2866 if (kvm_x86_ops->tlb_remote_flush) {
2867 spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
2868 to_vmx(vcpu)->ept_pointer = eptp;
2869 to_kvm_vmx(kvm)->ept_pointers_match
2870 = EPT_POINTERS_CHECK;
2871 spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
2872 }
2873
2874 if (enable_unrestricted_guest || is_paging(vcpu) ||
2875 is_guest_mode(vcpu))
2876 guest_cr3 = kvm_read_cr3(vcpu);
2877 else
2878 guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
2879 ept_load_pdptrs(vcpu);
2880 }
2881
2882 vmcs_writel(GUEST_CR3, guest_cr3);
2883}
2884
2885int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
2886{
2887 /*
2888 * Pass through host's Machine Check Enable value to hw_cr4, which
2889 * is in force while we are in guest mode. Do not let guests control
2890 * this bit, even if host CR4.MCE == 0.
2891 */
2892 unsigned long hw_cr4;
2893
2894 hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
2895 if (enable_unrestricted_guest)
2896 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
2897 else if (to_vmx(vcpu)->rmode.vm86_active)
2898 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
2899 else
2900 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
2901
2902 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
2903 if (cr4 & X86_CR4_UMIP) {
2904 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
2905 SECONDARY_EXEC_DESC);
2906 hw_cr4 &= ~X86_CR4_UMIP;
2907 } else if (!is_guest_mode(vcpu) ||
2908 !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC))
2909 vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
2910 SECONDARY_EXEC_DESC);
2911 }
2912
2913 if (cr4 & X86_CR4_VMXE) {
2914 /*
2915 * To use VMXON (and later other VMX instructions), a guest
2916 * must first be able to turn on cr4.VMXE (see handle_vmon()).
2917		 * This is therefore the point where we decide whether to allow
2918		 * nested VMX. We operate under the default treatment of SMM,
2919 * so VMX cannot be enabled under SMM.
2920 */
2921 if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
2922 return 1;
2923 }
2924
2925 if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
2926 return 1;
2927
2928 vcpu->arch.cr4 = cr4;
2929
2930 if (!enable_unrestricted_guest) {
2931 if (enable_ept) {
2932 if (!is_paging(vcpu)) {
2933 hw_cr4 &= ~X86_CR4_PAE;
2934 hw_cr4 |= X86_CR4_PSE;
2935 } else if (!(cr4 & X86_CR4_PAE)) {
2936 hw_cr4 &= ~X86_CR4_PAE;
2937 }
2938 }
2939
2940 /*
2941		 * SMEP/SMAP/PKU is disabled in hardware if the CPU is in
2942		 * non-paging mode. To emulate this behavior, SMEP/SMAP/PKU
2943		 * needs to be manually disabled when the guest switches to
2944		 * non-paging mode.
2945 *
2946 * If !enable_unrestricted_guest, the CPU is always running
2947 * with CR0.PG=1 and CR4 needs to be modified.
2948 * If enable_unrestricted_guest, the CPU automatically
2949 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
2950 */
2951 if (!is_paging(vcpu))
2952 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
2953 }
2954
2955 vmcs_writel(CR4_READ_SHADOW, cr4);
2956 vmcs_writel(GUEST_CR4, hw_cr4);
2957 return 0;
2958}
2959
2960void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
2961{
2962 struct vcpu_vmx *vmx = to_vmx(vcpu);
2963 u32 ar;
2964
2965 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
2966 *var = vmx->rmode.segs[seg];
2967 if (seg == VCPU_SREG_TR
2968 || var->selector == vmx_read_guest_seg_selector(vmx, seg))
2969 return;
2970 var->base = vmx_read_guest_seg_base(vmx, seg);
2971 var->selector = vmx_read_guest_seg_selector(vmx, seg);
2972 return;
2973 }
2974 var->base = vmx_read_guest_seg_base(vmx, seg);
2975 var->limit = vmx_read_guest_seg_limit(vmx, seg);
2976 var->selector = vmx_read_guest_seg_selector(vmx, seg);
2977 ar = vmx_read_guest_seg_ar(vmx, seg);
2978 var->unusable = (ar >> 16) & 1;
2979 var->type = ar & 15;
2980 var->s = (ar >> 4) & 1;
2981 var->dpl = (ar >> 5) & 3;
2982 /*
2983	 * Some userspaces do not preserve the unusable property. Since a
2984	 * usable segment has to be present according to the VMX spec, we can
2985	 * work around the userspace bug by reporting an unusable segment as
2986	 * not present. vmx_segment_access_rights() already marks a nonpresent
2987	 * segment as unusable.
2988 */
2989 var->present = !var->unusable;
2990 var->avl = (ar >> 12) & 1;
2991 var->l = (ar >> 13) & 1;
2992 var->db = (ar >> 14) & 1;
2993 var->g = (ar >> 15) & 1;
2994}
2995
2996static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
2997{
2998 struct kvm_segment s;
2999
3000 if (to_vmx(vcpu)->rmode.vm86_active) {
3001 vmx_get_segment(vcpu, &s, seg);
3002 return s.base;
3003 }
3004 return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
3005}
3006
3007int vmx_get_cpl(struct kvm_vcpu *vcpu)
3008{
3009 struct vcpu_vmx *vmx = to_vmx(vcpu);
3010
3011 if (unlikely(vmx->rmode.vm86_active))
3012 return 0;
3013 else {
3014 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
3015 return VMX_AR_DPL(ar);
3016 }
3017}
3018
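/*
 * Pack a kvm_segment into the VMCS access-rights format:
 * bits 3:0 type, bit 4 S, bits 6:5 DPL, bit 7 P, bit 12 AVL,
 * bit 13 L, bit 14 D/B, bit 15 G, bit 16 "unusable".
 */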
3019static u32 vmx_segment_access_rights(struct kvm_segment *var)
3020{
3021 u32 ar;
3022
3023 if (var->unusable || !var->present)
3024 ar = 1 << 16;
3025 else {
3026 ar = var->type & 15;
3027 ar |= (var->s & 1) << 4;
3028 ar |= (var->dpl & 3) << 5;
3029 ar |= (var->present & 1) << 7;
3030 ar |= (var->avl & 1) << 12;
3031 ar |= (var->l & 1) << 13;
3032 ar |= (var->db & 1) << 14;
3033 ar |= (var->g & 1) << 15;
3034 }
3035
3036 return ar;
3037}
3038
3039void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3040{
3041 struct vcpu_vmx *vmx = to_vmx(vcpu);
3042 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3043
3044 vmx_segment_cache_clear(vmx);
3045
3046 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3047 vmx->rmode.segs[seg] = *var;
3048 if (seg == VCPU_SREG_TR)
3049 vmcs_write16(sf->selector, var->selector);
3050 else if (var->s)
3051 fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
3052 goto out;
3053 }
3054
3055 vmcs_writel(sf->base, var->base);
3056 vmcs_write32(sf->limit, var->limit);
3057 vmcs_write16(sf->selector, var->selector);
3058
3059 /*
3060 * Fix the "Accessed" bit in AR field of segment registers for older
3061 * qemu binaries.
3062	 * The IA-32 architecture specifies that at processor reset the
3063	 * "Accessed" bit in the AR field of segment registers is 1, but qemu
3064	 * sets it to 0 in its userland code. This causes an invalid guest
3065	 * state vmexit when "unrestricted guest" mode is turned on.
3066	 * A fix for this setup issue in cpu_reset is being pushed in the qemu
3067	 * tree; newer qemu binaries with that fix will not need this
3068	 * kvm hack.
3069 */
3070 if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
3071 var->type |= 0x1; /* Accessed */
3072
3073 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
3074
3075out:
3076 vmx->emulation_required = emulation_required(vcpu);
3077}
3078
3079static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3080{
3081 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
3082
3083 *db = (ar >> 14) & 1;
3084 *l = (ar >> 13) & 1;
3085}
3086
3087static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3088{
3089 dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
3090 dt->address = vmcs_readl(GUEST_IDTR_BASE);
3091}
3092
3093static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3094{
3095 vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
3096 vmcs_writel(GUEST_IDTR_BASE, dt->address);
3097}
3098
3099static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3100{
3101 dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
3102 dt->address = vmcs_readl(GUEST_GDTR_BASE);
3103}
3104
3105static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3106{
3107 vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
3108 vmcs_writel(GUEST_GDTR_BASE, dt->address);
3109}
3110
3111static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3112{
3113 struct kvm_segment var;
3114 u32 ar;
3115
3116 vmx_get_segment(vcpu, &var, seg);
3117 var.dpl = 0x3;
3118 if (seg == VCPU_SREG_CS)
3119 var.type = 0x3;
3120 ar = vmx_segment_access_rights(&var);
3121
3122 if (var.base != (var.selector << 4))
3123 return false;
3124 if (var.limit != 0xffff)
3125 return false;
3126 if (ar != 0xf3)
3127 return false;
3128
3129 return true;
3130}
3131
3132static bool code_segment_valid(struct kvm_vcpu *vcpu)
3133{
3134 struct kvm_segment cs;
3135 unsigned int cs_rpl;
3136
3137 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3138 cs_rpl = cs.selector & SEGMENT_RPL_MASK;
3139
3140 if (cs.unusable)
3141 return false;
3142 if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
3143 return false;
3144 if (!cs.s)
3145 return false;
3146 if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
3147 if (cs.dpl > cs_rpl)
3148 return false;
3149 } else {
3150 if (cs.dpl != cs_rpl)
3151 return false;
3152 }
3153 if (!cs.present)
3154 return false;
3155
3156 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
3157 return true;
3158}
3159
3160static bool stack_segment_valid(struct kvm_vcpu *vcpu)
3161{
3162 struct kvm_segment ss;
3163 unsigned int ss_rpl;
3164
3165 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3166 ss_rpl = ss.selector & SEGMENT_RPL_MASK;
3167
3168 if (ss.unusable)
3169 return true;
3170 if (ss.type != 3 && ss.type != 7)
3171 return false;
3172 if (!ss.s)
3173 return false;
3174 if (ss.dpl != ss_rpl) /* DPL != RPL */
3175 return false;
3176 if (!ss.present)
3177 return false;
3178
3179 return true;
3180}
3181
3182static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
3183{
3184 struct kvm_segment var;
3185 unsigned int rpl;
3186
3187 vmx_get_segment(vcpu, &var, seg);
3188 rpl = var.selector & SEGMENT_RPL_MASK;
3189
3190 if (var.unusable)
3191 return true;
3192 if (!var.s)
3193 return false;
3194 if (!var.present)
3195 return false;
3196 if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
3197 if (var.dpl < rpl) /* DPL < RPL */
3198 return false;
3199 }
3200
3201 /* TODO: Add other members to kvm_segment_field to allow checking for other access
3202 * rights flags
3203 */
3204 return true;
3205}
3206
3207static bool tr_valid(struct kvm_vcpu *vcpu)
3208{
3209 struct kvm_segment tr;
3210
3211 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
3212
3213 if (tr.unusable)
3214 return false;
3215 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */
3216 return false;
3217 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
3218 return false;
3219 if (!tr.present)
3220 return false;
3221
3222 return true;
3223}
3224
3225static bool ldtr_valid(struct kvm_vcpu *vcpu)
3226{
3227 struct kvm_segment ldtr;
3228
3229 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
3230
3231 if (ldtr.unusable)
3232 return true;
3233 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */
3234 return false;
3235 if (ldtr.type != 2)
3236 return false;
3237 if (!ldtr.present)
3238 return false;
3239
3240 return true;
3241}
3242
3243static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
3244{
3245 struct kvm_segment cs, ss;
3246
3247 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3248 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3249
3250 return ((cs.selector & SEGMENT_RPL_MASK) ==
3251 (ss.selector & SEGMENT_RPL_MASK));
3252}
3253
3254/*
3255 * Check if guest state is valid. Returns true if valid, false if
3256 * not.
3257 * We assume that registers are always usable
3258 */
3259static bool guest_state_valid(struct kvm_vcpu *vcpu)
3260{
3261 if (enable_unrestricted_guest)
3262 return true;
3263
3264 /* real mode guest state checks */
3265 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
3266 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
3267 return false;
3268 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
3269 return false;
3270 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
3271 return false;
3272 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
3273 return false;
3274 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
3275 return false;
3276 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
3277 return false;
3278 } else {
3279 /* protected mode guest state checks */
3280 if (!cs_ss_rpl_check(vcpu))
3281 return false;
3282 if (!code_segment_valid(vcpu))
3283 return false;
3284 if (!stack_segment_valid(vcpu))
3285 return false;
3286 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
3287 return false;
3288 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
3289 return false;
3290 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
3291 return false;
3292 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
3293 return false;
3294 if (!tr_valid(vcpu))
3295 return false;
3296 if (!ldtr_valid(vcpu))
3297 return false;
3298 }
3299 /* TODO:
3300 * - Add checks on RIP
3301 * - Add checks on RFLAGS
3302 */
3303
3304 return true;
3305}
3306
3307static int init_rmode_tss(struct kvm *kvm)
3308{
3309 gfn_t fn;
3310 u16 data = 0;
3311 int idx, r;
3312
3313 idx = srcu_read_lock(&kvm->srcu);
3314 fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT;
3315 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
3316 if (r < 0)
3317 goto out;
3318 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
3319 r = kvm_write_guest_page(kvm, fn++, &data,
3320 TSS_IOPB_BASE_OFFSET, sizeof(u16));
3321 if (r < 0)
3322 goto out;
3323 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
3324 if (r < 0)
3325 goto out;
3326 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
3327 if (r < 0)
3328 goto out;
3329 data = ~0;
3330 r = kvm_write_guest_page(kvm, fn, &data,
3331 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
3332 sizeof(u8));
3333out:
3334 srcu_read_unlock(&kvm->srcu, idx);
3335 return r;
3336}
3337
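/*
 * Build the identity-mapped page table (1024 4MB PSE entries) at
 * ept_identity_map_addr; it backs the guest CR3 while a vCPU without
 * unrestricted-guest support runs unpaged in real mode with EPT enabled.
 */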
3338static int init_rmode_identity_map(struct kvm *kvm)
3339{
3340 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
3341 int i, idx, r = 0;
3342 kvm_pfn_t identity_map_pfn;
3343 u32 tmp;
3344
3345 /* Protect kvm_vmx->ept_identity_pagetable_done. */
3346 mutex_lock(&kvm->slots_lock);
3347
3348 if (likely(kvm_vmx->ept_identity_pagetable_done))
3349 goto out2;
3350
3351 if (!kvm_vmx->ept_identity_map_addr)
3352 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
3353 identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT;
3354
3355 r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
3356 kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
3357 if (r < 0)
3358 goto out2;
3359
3360 idx = srcu_read_lock(&kvm->srcu);
3361 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
3362 if (r < 0)
3363 goto out;
3364 /* Set up identity-mapping pagetable for EPT in real mode */
3365 for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
3366 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
3367 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
3368 r = kvm_write_guest_page(kvm, identity_map_pfn,
3369 &tmp, i * sizeof(tmp), sizeof(tmp));
3370 if (r < 0)
3371 goto out;
3372 }
3373 kvm_vmx->ept_identity_pagetable_done = true;
3374
3375out:
3376 srcu_read_unlock(&kvm->srcu, idx);
3377
3378out2:
3379 mutex_unlock(&kvm->slots_lock);
3380 return r;
3381}
3382
3383static void seg_setup(int seg)
3384{
3385 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3386 unsigned int ar;
3387
3388 vmcs_write16(sf->selector, 0);
3389 vmcs_writel(sf->base, 0);
3390 vmcs_write32(sf->limit, 0xffff);
3391 ar = 0x93;
3392 if (seg == VCPU_SREG_CS)
3393 ar |= 0x08; /* code segment */
3394
3395 vmcs_write32(sf->ar_bytes, ar);
3396}
3397
3398static int alloc_apic_access_page(struct kvm *kvm)
3399{
3400 struct page *page;
3401 int r = 0;
3402
3403 mutex_lock(&kvm->slots_lock);
3404 if (kvm->arch.apic_access_page_done)
3405 goto out;
3406 r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
3407 APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
3408 if (r)
3409 goto out;
3410
3411 page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
3412 if (is_error_page(page)) {
3413 r = -EFAULT;
3414 goto out;
3415 }
3416
3417 /*
3418 * Do not pin the page in memory, so that memory hot-unplug
3419 * is able to migrate it.
3420 */
3421 put_page(page);
3422 kvm->arch.apic_access_page_done = true;
3423out:
3424 mutex_unlock(&kvm->slots_lock);
3425 return r;
3426}
3427
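/*
 * Allocate a hardware VPID for a new vcpu.  VPID 0 is reserved for the
 * host, so returning 0 means either that VPID is disabled or that the
 * pool is exhausted; in that case the vcpu runs with ENABLE_VPID off
 * (see vmx_compute_secondary_exec_control()).
 */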
3428int allocate_vpid(void)
3429{
3430 int vpid;
3431
3432 if (!enable_vpid)
3433 return 0;
3434 spin_lock(&vmx_vpid_lock);
3435 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
3436 if (vpid < VMX_NR_VPIDS)
3437 __set_bit(vpid, vmx_vpid_bitmap);
3438 else
3439 vpid = 0;
3440 spin_unlock(&vmx_vpid_lock);
3441 return vpid;
3442}
3443
3444void free_vpid(int vpid)
3445{
3446 if (!enable_vpid || vpid == 0)
3447 return;
3448 spin_lock(&vmx_vpid_lock);
3449 __clear_bit(vpid, vmx_vpid_bitmap);
3450 spin_unlock(&vmx_vpid_lock);
3451}
3452
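/*
 * The MSR permission bitmap is a single 4K page split into four 1K
 * regions: read-low (offset 0x000) and write-low (0x800) cover MSRs
 * 0x00000000-0x00001fff, while read-high (0x400) and write-high (0xc00)
 * cover MSRs 0xc0000000-0xc0001fff.  A set bit means the access is
 * intercepted; the helpers below clear or set the relevant bits.
 */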
3453static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
3454 u32 msr, int type)
3455{
3456 int f = sizeof(unsigned long);
3457
3458 if (!cpu_has_vmx_msr_bitmap())
3459 return;
3460
3461 if (static_branch_unlikely(&enable_evmcs))
3462 evmcs_touch_msr_bitmap();
3463
3464 /*
3465 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
3466 * have the write-low and read-high bitmap offsets the wrong way round.
3467 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
3468 */
3469 if (msr <= 0x1fff) {
3470 if (type & MSR_TYPE_R)
3471 /* read-low */
3472 __clear_bit(msr, msr_bitmap + 0x000 / f);
3473
3474 if (type & MSR_TYPE_W)
3475 /* write-low */
3476 __clear_bit(msr, msr_bitmap + 0x800 / f);
3477
3478 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
3479 msr &= 0x1fff;
3480 if (type & MSR_TYPE_R)
3481 /* read-high */
3482 __clear_bit(msr, msr_bitmap + 0x400 / f);
3483
3484 if (type & MSR_TYPE_W)
3485 /* write-high */
3486 __clear_bit(msr, msr_bitmap + 0xc00 / f);
3487
3488 }
3489}
3490
3491static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
3492 u32 msr, int type)
3493{
3494 int f = sizeof(unsigned long);
3495
3496 if (!cpu_has_vmx_msr_bitmap())
3497 return;
3498
3499 if (static_branch_unlikely(&enable_evmcs))
3500 evmcs_touch_msr_bitmap();
3501
3502 /*
3503 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
3504 * have the write-low and read-high bitmap offsets the wrong way round.
3505 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
3506 */
3507 if (msr <= 0x1fff) {
3508 if (type & MSR_TYPE_R)
3509 /* read-low */
3510 __set_bit(msr, msr_bitmap + 0x000 / f);
3511
3512 if (type & MSR_TYPE_W)
3513 /* write-low */
3514 __set_bit(msr, msr_bitmap + 0x800 / f);
3515
3516 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
3517 msr &= 0x1fff;
3518 if (type & MSR_TYPE_R)
3519 /* read-high */
3520 __set_bit(msr, msr_bitmap + 0x400 / f);
3521
3522 if (type & MSR_TYPE_W)
3523 /* write-high */
3524 __set_bit(msr, msr_bitmap + 0xc00 / f);
3525
3526 }
3527}
3528
3529static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
3530 u32 msr, int type, bool value)
3531{
3532 if (value)
3533 vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
3534 else
3535 vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
3536}
3537
3538static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
3539{
3540 u8 mode = 0;
3541
3542 if (cpu_has_secondary_exec_ctrls() &&
3543 (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
3544 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
3545 mode |= MSR_BITMAP_MODE_X2APIC;
3546 if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
3547 mode |= MSR_BITMAP_MODE_X2APIC_APICV;
3548 }
3549
3550 return mode;
3551}
3552
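/*
 * Apply the default intercept policy for the x2APIC MSR range
 * (0x800-0x8ff): when APICv is active, reads are not intercepted and are
 * served from the virtual APIC page, while writes remain intercepted;
 * without APICv, both reads and writes are intercepted.  The per-register
 * exceptions are then carved out below.
 */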
3553static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
3554 u8 mode)
3555{
3556 int msr;
3557
3558 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
3559 unsigned word = msr / BITS_PER_LONG;
3560 msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
3561 msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
3562 }
3563
3564 if (mode & MSR_BITMAP_MODE_X2APIC) {
3565 /*
3566 * TPR reads and writes can be virtualized even if virtual interrupt
3567 * delivery is not in use.
3568 */
3569 vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
3570 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
3571 vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
3572 vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
3573 vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
3574 }
3575 }
3576}
3577
3578void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
3579{
3580 struct vcpu_vmx *vmx = to_vmx(vcpu);
3581 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
3582 u8 mode = vmx_msr_bitmap_mode(vcpu);
3583 u8 changed = mode ^ vmx->msr_bitmap_mode;
3584
3585 if (!changed)
3586 return;
3587
3588 if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
3589 vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
3590
3591 vmx->msr_bitmap_mode = mode;
3592}
3593
3594void pt_update_intercept_for_msr(struct vcpu_vmx *vmx)
3595{
3596 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
3597 bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
3598 u32 i;
3599
3600 vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS,
3601 MSR_TYPE_RW, flag);
3602 vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE,
3603 MSR_TYPE_RW, flag);
3604 vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK,
3605 MSR_TYPE_RW, flag);
3606 vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH,
3607 MSR_TYPE_RW, flag);
3608 for (i = 0; i < vmx->pt_desc.addr_range; i++) {
3609 vmx_set_intercept_for_msr(msr_bitmap,
3610 MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
3611 vmx_set_intercept_for_msr(msr_bitmap,
3612 MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
3613 }
3614}
3615
3616static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
3617{
3618 return enable_apicv;
3619}
3620
3621static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
3622{
3623 struct vcpu_vmx *vmx = to_vmx(vcpu);
3624 void *vapic_page;
3625 u32 vppr;
3626 int rvi;
3627
3628 if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
3629 !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
3630 WARN_ON_ONCE(!vmx->nested.virtual_apic_page))
3631 return false;
3632
3633 rvi = vmx_get_rvi();
3634
3635 vapic_page = kmap(vmx->nested.virtual_apic_page);
3636 vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
3637 kunmap(vmx->nested.virtual_apic_page);
3638
3639 return ((rvi & 0xf0) > (vppr & 0xf0));
3640}
3641
3642static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
3643 bool nested)
3644{
3645#ifdef CONFIG_SMP
3646 int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
3647
3648 if (vcpu->mode == IN_GUEST_MODE) {
3649 /*
3650		 * The vector of the interrupt to be delivered to the vcpu
3651		 * has already been set in the PIR before this function runs.
3652		 *
3653		 * The following cases can be reached in this block, and we
3654		 * always send a notification event in all of them, as
3655		 * explained below.
3656		 *
3657		 * Case 1: the vcpu stays in non-root mode. Sending a
3658		 * notification event posts the interrupt to the vcpu.
3659		 *
3660		 * Case 2: the vcpu exits to root mode and is still
3661		 * runnable. The PIR will be synced to the vIRR before the
3662		 * next vcpu entry. Sending a notification event in this
3663		 * case has no effect, as the vcpu is no longer in non-root
3664		 * mode.
3665		 *
3666		 * Case 3: the vcpu exits to root mode and is blocked.
3667		 * vcpu_block() has already synced the PIR to the vIRR and
3668		 * never blocks the vcpu if the vIRR is not cleared, so a
3669		 * blocked vcpu here is not waiting for any requested
3670		 * interrupts in the PIR, and sending a notification event
3671		 * that has no effect is safe here.
3672 */
3673
3674 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
3675 return true;
3676 }
3677#endif
3678 return false;
3679}
3680
3681static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
3682 int vector)
3683{
3684 struct vcpu_vmx *vmx = to_vmx(vcpu);
3685
3686 if (is_guest_mode(vcpu) &&
3687 vector == vmx->nested.posted_intr_nv) {
3688 /*
3689		 * If a posted interrupt is not recognized by hardware,
3690		 * we will deliver it on the next vmentry.
3691 */
3692 vmx->nested.pi_pending = true;
3693 kvm_make_request(KVM_REQ_EVENT, vcpu);
3694 /* the PIR and ON have been set by L1. */
3695 if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
3696 kvm_vcpu_kick(vcpu);
3697 return 0;
3698 }
3699 return -1;
3700}
3701/*
3702 * Send an interrupt to a vcpu via the posted-interrupt mechanism.
3703 * 1. If the target vcpu is running (non-root mode), send a posted-interrupt
3704 * notification and the hardware will sync the PIR to the vIRR atomically.
3705 * 2. If the target vcpu isn't running (root mode), kick it so that it picks
3706 * up the interrupt from the PIR on the next vmentry.
3707 */
3708static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
3709{
3710 struct vcpu_vmx *vmx = to_vmx(vcpu);
3711 int r;
3712
3713 r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
3714 if (!r)
3715 return;
3716
3717 if (pi_test_and_set_pir(vector, &vmx->pi_desc))
3718 return;
3719
3720 /* If a previous notification has sent the IPI, nothing to do. */
3721 if (pi_test_and_set_on(&vmx->pi_desc))
3722 return;
3723
3724 if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
3725 kvm_vcpu_kick(vcpu);
3726}
3727
3728/*
3729 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
3730 * will not change in the lifetime of the guest.
3731 * Note that host-state that does change is set elsewhere. E.g., host-state
3732 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
3733 */
3734void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
3735{
3736 u32 low32, high32;
3737 unsigned long tmpl;
3738 struct desc_ptr dt;
3739 unsigned long cr0, cr3, cr4;
3740
3741 cr0 = read_cr0();
3742 WARN_ON(cr0 & X86_CR0_TS);
3743 vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
3744
3745 /*
3746 * Save the most likely value for this task's CR3 in the VMCS.
3747 * We can't use __get_current_cr3_fast() because we're not atomic.
3748 */
3749 cr3 = __read_cr3();
3750 vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
3751 vmx->loaded_vmcs->host_state.cr3 = cr3;
3752
3753 /* Save the most likely value for this task's CR4 in the VMCS. */
3754 cr4 = cr4_read_shadow();
3755 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
3756 vmx->loaded_vmcs->host_state.cr4 = cr4;
3757
3758 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
3759#ifdef CONFIG_X86_64
3760 /*
3761 * Load null selectors, so we can avoid reloading them in
3762 * vmx_prepare_switch_to_host(), in case userspace uses
3763 * the null selectors too (the expected case).
3764 */
3765 vmcs_write16(HOST_DS_SELECTOR, 0);
3766 vmcs_write16(HOST_ES_SELECTOR, 0);
3767#else
3768 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
3769 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
3770#endif
3771 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
3772 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
3773
3774 store_idt(&dt);
3775 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
3776 vmx->host_idt_base = dt.address;
3777
3778 vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */
3779
3780 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
3781 vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
3782 rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
3783 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
3784
3785 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
3786 rdmsr(MSR_IA32_CR_PAT, low32, high32);
3787 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
3788 }
3789
3790 if (cpu_has_load_ia32_efer())
3791 vmcs_write64(HOST_IA32_EFER, host_efer);
3792}
3793
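/*
 * Bits set in cr4_guest_owned_bits can be changed by the guest without a
 * VM exit; CR4_GUEST_HOST_MASK is therefore written as the complement of
 * that set.  When running L2, any bit that L1 wants to intercept is
 * removed from the guest-owned set as well.
 */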
3794void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
3795{
3796 vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
3797 if (enable_ept)
3798 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
3799 if (is_guest_mode(&vmx->vcpu))
3800 vmx->vcpu.arch.cr4_guest_owned_bits &=
3801 ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
3802 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
3803}
3804
3805static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
3806{
3807 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
3808
3809 if (!kvm_vcpu_apicv_active(&vmx->vcpu))
3810 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
3811
3812 if (!enable_vnmi)
3813 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
3814
3815 /* Enable the preemption timer dynamically */
3816 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
3817 return pin_based_exec_ctrl;
3818}
3819
3820static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
3821{
3822 struct vcpu_vmx *vmx = to_vmx(vcpu);
3823
3824 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
3825 if (cpu_has_secondary_exec_ctrls()) {
3826 if (kvm_vcpu_apicv_active(vcpu))
3827 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
3828 SECONDARY_EXEC_APIC_REGISTER_VIRT |
3829 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
3830 else
3831 vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
3832 SECONDARY_EXEC_APIC_REGISTER_VIRT |
3833 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
3834 }
3835
3836 if (cpu_has_vmx_msr_bitmap())
3837 vmx_update_msr_bitmap(vcpu);
3838}
3839
3840u32 vmx_exec_control(struct vcpu_vmx *vmx)
3841{
3842 u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
3843
3844 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
3845 exec_control &= ~CPU_BASED_MOV_DR_EXITING;
3846
3847 if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
3848 exec_control &= ~CPU_BASED_TPR_SHADOW;
3849#ifdef CONFIG_X86_64
3850 exec_control |= CPU_BASED_CR8_STORE_EXITING |
3851 CPU_BASED_CR8_LOAD_EXITING;
3852#endif
3853 }
3854 if (!enable_ept)
3855 exec_control |= CPU_BASED_CR3_STORE_EXITING |
3856 CPU_BASED_CR3_LOAD_EXITING |
3857 CPU_BASED_INVLPG_EXITING;
3858 if (kvm_mwait_in_guest(vmx->vcpu.kvm))
3859 exec_control &= ~(CPU_BASED_MWAIT_EXITING |
3860 CPU_BASED_MONITOR_EXITING);
3861 if (kvm_hlt_in_guest(vmx->vcpu.kvm))
3862 exec_control &= ~CPU_BASED_HLT_EXITING;
3863 return exec_control;
3864}
3865
3866
3867static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
3868{
3869 struct kvm_vcpu *vcpu = &vmx->vcpu;
3870
3871 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
3872
3873 if (pt_mode == PT_MODE_SYSTEM)
3874 exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
3875 if (!cpu_need_virtualize_apic_accesses(vcpu))
3876 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
3877 if (vmx->vpid == 0)
3878 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
3879 if (!enable_ept) {
3880 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
3881 enable_unrestricted_guest = 0;
3882 }
3883 if (!enable_unrestricted_guest)
3884 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
3885 if (kvm_pause_in_guest(vmx->vcpu.kvm))
3886 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
3887 if (!kvm_vcpu_apicv_active(vcpu))
3888 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
3889 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
3890 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
3891
3892 /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
3893 * in vmx_set_cr4. */
3894 exec_control &= ~SECONDARY_EXEC_DESC;
3895
3896	/* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
3897	 * (handle_vmptrld).
3898	 * We can NOT enable shadow_vmcs here because we don't yet have
3899	 * a current VMCS12.
3900	 */
3901 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
3902
3903 if (!enable_pml)
3904 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
3905
3906 if (vmx_xsaves_supported()) {
3907 /* Exposing XSAVES only when XSAVE is exposed */
3908 bool xsaves_enabled =
3909 guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
3910 guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
3911
3912 if (!xsaves_enabled)
3913 exec_control &= ~SECONDARY_EXEC_XSAVES;
3914
3915 if (nested) {
3916 if (xsaves_enabled)
3917 vmx->nested.msrs.secondary_ctls_high |=
3918 SECONDARY_EXEC_XSAVES;
3919 else
3920 vmx->nested.msrs.secondary_ctls_high &=
3921 ~SECONDARY_EXEC_XSAVES;
3922 }
3923 }
3924
3925 if (vmx_rdtscp_supported()) {
3926 bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
3927 if (!rdtscp_enabled)
3928 exec_control &= ~SECONDARY_EXEC_RDTSCP;
3929
3930 if (nested) {
3931 if (rdtscp_enabled)
3932 vmx->nested.msrs.secondary_ctls_high |=
3933 SECONDARY_EXEC_RDTSCP;
3934 else
3935 vmx->nested.msrs.secondary_ctls_high &=
3936 ~SECONDARY_EXEC_RDTSCP;
3937 }
3938 }
3939
3940 if (vmx_invpcid_supported()) {
3941 /* Exposing INVPCID only when PCID is exposed */
3942 bool invpcid_enabled =
3943 guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
3944 guest_cpuid_has(vcpu, X86_FEATURE_PCID);
3945
3946 if (!invpcid_enabled) {
3947 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
3948 guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
3949 }
3950
3951 if (nested) {
3952 if (invpcid_enabled)
3953 vmx->nested.msrs.secondary_ctls_high |=
3954 SECONDARY_EXEC_ENABLE_INVPCID;
3955 else
3956 vmx->nested.msrs.secondary_ctls_high &=
3957 ~SECONDARY_EXEC_ENABLE_INVPCID;
3958 }
3959 }
3960
3961 if (vmx_rdrand_supported()) {
3962 bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
3963 if (rdrand_enabled)
3964 exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
3965
3966 if (nested) {
3967 if (rdrand_enabled)
3968 vmx->nested.msrs.secondary_ctls_high |=
3969 SECONDARY_EXEC_RDRAND_EXITING;
3970 else
3971 vmx->nested.msrs.secondary_ctls_high &=
3972 ~SECONDARY_EXEC_RDRAND_EXITING;
3973 }
3974 }
3975
3976 if (vmx_rdseed_supported()) {
3977 bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
3978 if (rdseed_enabled)
3979 exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
3980
3981 if (nested) {
3982 if (rdseed_enabled)
3983 vmx->nested.msrs.secondary_ctls_high |=
3984 SECONDARY_EXEC_RDSEED_EXITING;
3985 else
3986 vmx->nested.msrs.secondary_ctls_high &=
3987 ~SECONDARY_EXEC_RDSEED_EXITING;
3988 }
3989 }
3990
3991 vmx->secondary_exec_control = exec_control;
3992}
3993
3994static void ept_set_mmio_spte_mask(void)
3995{
3996 /*
3997 * EPT Misconfigurations can be generated if the value of bits 2:0
3998 * of an EPT paging-structure entry is 110b (write/execute).
3999 */
4000 kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
4001 VMX_EPT_MISCONFIG_WX_VALUE);
4002}
4003
4004#define VMX_XSS_EXIT_BITMAP 0
4005
4006/*
4007 * Sets up the vmcs for emulated real mode.
4008 */
4009static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
4010{
4011 int i;
4012
4013 if (nested)
4014 nested_vmx_vcpu_setup();
4015
4016 if (cpu_has_vmx_msr_bitmap())
4017 vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
4018
4019 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
4020
4021 /* Control */
4022 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
4023 vmx->hv_deadline_tsc = -1;
4024
4025 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
4026
4027 if (cpu_has_secondary_exec_ctrls()) {
4028 vmx_compute_secondary_exec_control(vmx);
4029 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
4030 vmx->secondary_exec_control);
4031 }
4032
4033 if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
4034 vmcs_write64(EOI_EXIT_BITMAP0, 0);
4035 vmcs_write64(EOI_EXIT_BITMAP1, 0);
4036 vmcs_write64(EOI_EXIT_BITMAP2, 0);
4037 vmcs_write64(EOI_EXIT_BITMAP3, 0);
4038
4039 vmcs_write16(GUEST_INTR_STATUS, 0);
4040
4041 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
4042 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
4043 }
4044
4045 if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
4046 vmcs_write32(PLE_GAP, ple_gap);
4047 vmx->ple_window = ple_window;
4048 vmx->ple_window_dirty = true;
4049 }
4050
4051 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
4052 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
4053 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
4054
4055 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
4056 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
4057 vmx_set_constant_host_state(vmx);
4058 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
4059 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
4060
4061 if (cpu_has_vmx_vmfunc())
4062 vmcs_write64(VM_FUNCTION_CONTROL, 0);
4063
4064 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
4065 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
4066 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
4067 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
4068 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
4069
4070 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
4071 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
4072
4073 for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
4074 u32 index = vmx_msr_index[i];
4075 u32 data_low, data_high;
4076 int j = vmx->nmsrs;
4077
4078 if (rdmsr_safe(index, &data_low, &data_high) < 0)
4079 continue;
4080 if (wrmsr_safe(index, data_low, data_high) < 0)
4081 continue;
4082 vmx->guest_msrs[j].index = i;
4083 vmx->guest_msrs[j].data = 0;
4084 vmx->guest_msrs[j].mask = -1ull;
4085 ++vmx->nmsrs;
4086 }
4087
4088 vmx->arch_capabilities = kvm_get_arch_capabilities();
4089
4090 vm_exit_controls_init(vmx, vmx_vmexit_ctrl());
4091
4092 /* 22.2.1, 20.8.1 */
4093 vm_entry_controls_init(vmx, vmx_vmentry_ctrl());
4094
4095 vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
4096 vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
4097
4098 set_cr4_guest_host_mask(vmx);
4099
4100 if (vmx_xsaves_supported())
4101 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
4102
4103 if (enable_pml) {
4104 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
4105 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
4106 }
4107
4108 if (cpu_has_vmx_encls_vmexit())
4109 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
4110
4111 if (pt_mode == PT_MODE_HOST_GUEST) {
4112 memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
4113		/* Bits[6:0] are forced to 1; writes are ignored. */
4114 vmx->pt_desc.guest.output_mask = 0x7F;
4115 vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
4116 }
4117}
4118
4119static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
4120{
4121 struct vcpu_vmx *vmx = to_vmx(vcpu);
4122 struct msr_data apic_base_msr;
4123 u64 cr0;
4124
4125 vmx->rmode.vm86_active = 0;
4126 vmx->spec_ctrl = 0;
4127
4128 vcpu->arch.microcode_version = 0x100000000ULL;
4129 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
4130 kvm_set_cr8(vcpu, 0);
4131
4132 if (!init_event) {
4133 apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
4134 MSR_IA32_APICBASE_ENABLE;
4135 if (kvm_vcpu_is_reset_bsp(vcpu))
4136 apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
4137 apic_base_msr.host_initiated = true;
4138 kvm_set_apic_base(vcpu, &apic_base_msr);
4139 }
4140
4141 vmx_segment_cache_clear(vmx);
4142
4143 seg_setup(VCPU_SREG_CS);
4144 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
4145 vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
4146
4147 seg_setup(VCPU_SREG_DS);
4148 seg_setup(VCPU_SREG_ES);
4149 seg_setup(VCPU_SREG_FS);
4150 seg_setup(VCPU_SREG_GS);
4151 seg_setup(VCPU_SREG_SS);
4152
4153 vmcs_write16(GUEST_TR_SELECTOR, 0);
4154 vmcs_writel(GUEST_TR_BASE, 0);
4155 vmcs_write32(GUEST_TR_LIMIT, 0xffff);
4156 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
4157
4158 vmcs_write16(GUEST_LDTR_SELECTOR, 0);
4159 vmcs_writel(GUEST_LDTR_BASE, 0);
4160 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
4161 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
4162
4163 if (!init_event) {
4164 vmcs_write32(GUEST_SYSENTER_CS, 0);
4165 vmcs_writel(GUEST_SYSENTER_ESP, 0);
4166 vmcs_writel(GUEST_SYSENTER_EIP, 0);
4167 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4168 }
4169
4170 kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
4171 kvm_rip_write(vcpu, 0xfff0);
4172
4173 vmcs_writel(GUEST_GDTR_BASE, 0);
4174 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
4175
4176 vmcs_writel(GUEST_IDTR_BASE, 0);
4177 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
4178
4179 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
4180 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
4181 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
4182 if (kvm_mpx_supported())
4183 vmcs_write64(GUEST_BNDCFGS, 0);
4184
4185 setup_msrs(vmx);
4186
4187 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
4188
4189 if (cpu_has_vmx_tpr_shadow() && !init_event) {
4190 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
4191 if (cpu_need_tpr_shadow(vcpu))
4192 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
4193 __pa(vcpu->arch.apic->regs));
4194 vmcs_write32(TPR_THRESHOLD, 0);
4195 }
4196
4197 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
4198
4199 if (vmx->vpid != 0)
4200 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
4201
4202 cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
4203 vmx->vcpu.arch.cr0 = cr0;
4204 vmx_set_cr0(vcpu, cr0); /* enter rmode */
4205 vmx_set_cr4(vcpu, 0);
4206 vmx_set_efer(vcpu, 0);
4207
4208 update_exception_bitmap(vcpu);
4209
4210 vpid_sync_context(vmx->vpid);
4211 if (init_event)
4212 vmx_clear_hlt(vcpu);
4213}
4214
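/*
 * Request an "interrupt window" exit: with CPU_BASED_VIRTUAL_INTR_PENDING
 * set, the CPU exits as soon as the guest can accept an interrupt
 * (RFLAGS.IF set, no blocking by STI/MOV SS), letting KVM inject a
 * pending event; handle_interrupt_window() clears the bit again.
 */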
4215static void enable_irq_window(struct kvm_vcpu *vcpu)
4216{
4217 vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
4218 CPU_BASED_VIRTUAL_INTR_PENDING);
4219}
4220
4221static void enable_nmi_window(struct kvm_vcpu *vcpu)
4222{
4223 if (!enable_vnmi ||
4224 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
4225 enable_irq_window(vcpu);
4226 return;
4227 }
4228
4229 vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
4230 CPU_BASED_VIRTUAL_NMI_PENDING);
4231}
4232
4233static void vmx_inject_irq(struct kvm_vcpu *vcpu)
4234{
4235 struct vcpu_vmx *vmx = to_vmx(vcpu);
4236 uint32_t intr;
4237 int irq = vcpu->arch.interrupt.nr;
4238
4239 trace_kvm_inj_virq(irq);
4240
4241 ++vcpu->stat.irq_injections;
4242 if (vmx->rmode.vm86_active) {
4243 int inc_eip = 0;
4244 if (vcpu->arch.interrupt.soft)
4245 inc_eip = vcpu->arch.event_exit_inst_len;
4246 if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
4247 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4248 return;
4249 }
4250 intr = irq | INTR_INFO_VALID_MASK;
4251 if (vcpu->arch.interrupt.soft) {
4252 intr |= INTR_TYPE_SOFT_INTR;
4253 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
4254 vmx->vcpu.arch.event_exit_inst_len);
4255 } else
4256 intr |= INTR_TYPE_EXT_INTR;
4257 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
4258
4259 vmx_clear_hlt(vcpu);
4260}
4261
4262static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
4263{
4264 struct vcpu_vmx *vmx = to_vmx(vcpu);
4265
4266 if (!enable_vnmi) {
4267 /*
4268 * Tracking the NMI-blocked state in software is built upon
4269 * finding the next open IRQ window. This, in turn, depends on
4270 * well-behaving guests: They have to keep IRQs disabled at
4271 * least as long as the NMI handler runs. Otherwise we may
4272 * cause NMI nesting, maybe breaking the guest. But as this is
4273 * highly unlikely, we can live with the residual risk.
4274 */
4275 vmx->loaded_vmcs->soft_vnmi_blocked = 1;
4276 vmx->loaded_vmcs->vnmi_blocked_time = 0;
4277 }
4278
4279 ++vcpu->stat.nmi_injections;
4280 vmx->loaded_vmcs->nmi_known_unmasked = false;
4281
4282 if (vmx->rmode.vm86_active) {
4283 if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
4284 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4285 return;
4286 }
4287
4288 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
4289 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
4290
4291 vmx_clear_hlt(vcpu);
4292}
4293
4294bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
4295{
4296 struct vcpu_vmx *vmx = to_vmx(vcpu);
4297 bool masked;
4298
4299 if (!enable_vnmi)
4300 return vmx->loaded_vmcs->soft_vnmi_blocked;
4301 if (vmx->loaded_vmcs->nmi_known_unmasked)
4302 return false;
4303 masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
4304 vmx->loaded_vmcs->nmi_known_unmasked = !masked;
4305 return masked;
4306}
4307
4308void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
4309{
4310 struct vcpu_vmx *vmx = to_vmx(vcpu);
4311
4312 if (!enable_vnmi) {
4313 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
4314 vmx->loaded_vmcs->soft_vnmi_blocked = masked;
4315 vmx->loaded_vmcs->vnmi_blocked_time = 0;
4316 }
4317 } else {
4318 vmx->loaded_vmcs->nmi_known_unmasked = !masked;
4319 if (masked)
4320 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
4321 GUEST_INTR_STATE_NMI);
4322 else
4323 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
4324 GUEST_INTR_STATE_NMI);
4325 }
4326}
4327
4328static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
4329{
4330 if (to_vmx(vcpu)->nested.nested_run_pending)
4331 return 0;
4332
4333 if (!enable_vnmi &&
4334 to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
4335 return 0;
4336
4337 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
4338 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
4339 | GUEST_INTR_STATE_NMI));
4340}
4341
4342static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
4343{
4344 return (!to_vmx(vcpu)->nested.nested_run_pending &&
4345 vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
4346 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
4347 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
4348}
4349
4350static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
4351{
4352 int ret;
4353
4354 if (enable_unrestricted_guest)
4355 return 0;
4356
4357 ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
4358 PAGE_SIZE * 3);
4359 if (ret)
4360 return ret;
4361 to_kvm_vmx(kvm)->tss_addr = addr;
4362 return init_rmode_tss(kvm);
4363}
4364
4365static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
4366{
4367 to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
4368 return 0;
4369}
4370
4371static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
4372{
4373 switch (vec) {
4374 case BP_VECTOR:
4375 /*
4376 * Update instruction length as we may reinject the exception
4377 * from user space while in guest debugging mode.
4378 */
4379 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
4380 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
4381 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
4382 return false;
4383 /* fall through */
4384 case DB_VECTOR:
4385 if (vcpu->guest_debug &
4386 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
4387 return false;
4388 /* fall through */
4389 case DE_VECTOR:
4390 case OF_VECTOR:
4391 case BR_VECTOR:
4392 case UD_VECTOR:
4393 case DF_VECTOR:
4394 case SS_VECTOR:
4395 case GP_VECTOR:
4396 case MF_VECTOR:
4397 return true;
4399 }
4400 return false;
4401}
4402
4403static int handle_rmode_exception(struct kvm_vcpu *vcpu,
4404 int vec, u32 err_code)
4405{
4406 /*
4407	 * An instruction with the address-size override prefix (opcode 0x67)
4408	 * causes an #SS fault with error code 0 in VM86 mode.
4409 */
4410 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
4411 if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) {
4412 if (vcpu->arch.halt_request) {
4413 vcpu->arch.halt_request = 0;
4414 return kvm_vcpu_halt(vcpu);
4415 }
4416 return 1;
4417 }
4418 return 0;
4419 }
4420
4421 /*
4422 * Forward all other exceptions that are valid in real mode.
4423 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
4424 * the required debugging infrastructure rework.
4425 */
4426 kvm_queue_exception(vcpu, vec);
4427 return 1;
4428}
4429
4430/*
4431 * Trigger machine check on the host. We assume all the MSRs are already set up
4432 * by the CPU and that we still run on the same CPU as the MCE occurred on.
4433 * We pass a fake environment to the machine check handler because we want
4434 * the guest to always be treated like user space, no matter what context
4435 * it used internally.
4436 */
4437static void kvm_machine_check(void)
4438{
4439#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
4440 struct pt_regs regs = {
4441 .cs = 3, /* Fake ring 3 no matter what the guest ran on */
4442 .flags = X86_EFLAGS_IF,
4443 };
4444
4445 do_machine_check(&regs, 0);
4446#endif
4447}
4448
4449static int handle_machine_check(struct kvm_vcpu *vcpu)
4450{
4451 /* already handled by vcpu_run */
4452 return 1;
4453}
4454
4455static int handle_exception(struct kvm_vcpu *vcpu)
4456{
4457 struct vcpu_vmx *vmx = to_vmx(vcpu);
4458 struct kvm_run *kvm_run = vcpu->run;
4459 u32 intr_info, ex_no, error_code;
4460 unsigned long cr2, rip, dr6;
4461 u32 vect_info;
4462 enum emulation_result er;
4463
4464 vect_info = vmx->idt_vectoring_info;
4465 intr_info = vmx->exit_intr_info;
4466
4467 if (is_machine_check(intr_info))
4468 return handle_machine_check(vcpu);
4469
4470 if (is_nmi(intr_info))
4471 return 1; /* already handled by vmx_vcpu_run() */
4472
4473 if (is_invalid_opcode(intr_info))
4474 return handle_ud(vcpu);
4475
4476 error_code = 0;
4477 if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
4478 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
4479
4480 if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
4481 WARN_ON_ONCE(!enable_vmware_backdoor);
4482 er = kvm_emulate_instruction(vcpu,
4483 EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
4484 if (er == EMULATE_USER_EXIT)
4485 return 0;
4486 else if (er != EMULATE_DONE)
4487 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
4488 return 1;
4489 }
4490
4491 /*
4492	 * A #PF with PFEC.RSVD = 1 indicates the guest is accessing
4493	 * MMIO; in that case it is better to report an internal error.
4494 * See the comments in vmx_handle_exit.
4495 */
4496 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
4497 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
4498 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4499 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
4500 vcpu->run->internal.ndata = 3;
4501 vcpu->run->internal.data[0] = vect_info;
4502 vcpu->run->internal.data[1] = intr_info;
4503 vcpu->run->internal.data[2] = error_code;
4504 return 0;
4505 }
4506
4507 if (is_page_fault(intr_info)) {
4508 cr2 = vmcs_readl(EXIT_QUALIFICATION);
4509 /* EPT won't cause page fault directly */
4510 WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
4511 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
4512 }
4513
4514 ex_no = intr_info & INTR_INFO_VECTOR_MASK;
4515
4516 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
4517 return handle_rmode_exception(vcpu, ex_no, error_code);
4518
4519 switch (ex_no) {
4520 case AC_VECTOR:
4521 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
4522 return 1;
4523 case DB_VECTOR:
4524 dr6 = vmcs_readl(EXIT_QUALIFICATION);
4525 if (!(vcpu->guest_debug &
4526 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
4527 vcpu->arch.dr6 &= ~15;
4528 vcpu->arch.dr6 |= dr6 | DR6_RTM;
4529 if (is_icebp(intr_info))
4530 skip_emulated_instruction(vcpu);
4531
4532 kvm_queue_exception(vcpu, DB_VECTOR);
4533 return 1;
4534 }
4535 kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
4536 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
4537 /* fall through */
4538 case BP_VECTOR:
4539 /*
4540 * Update instruction length as we may reinject #BP from
4541 * user space while in guest debugging mode. Reading it for
4542		 * #DB as well causes no harm; it is not used in that case.
4543 */
4544 vmx->vcpu.arch.event_exit_inst_len =
4545 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
4546 kvm_run->exit_reason = KVM_EXIT_DEBUG;
4547 rip = kvm_rip_read(vcpu);
4548 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
4549 kvm_run->debug.arch.exception = ex_no;
4550 break;
4551 default:
4552 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
4553 kvm_run->ex.exception = ex_no;
4554 kvm_run->ex.error_code = error_code;
4555 break;
4556 }
4557 return 0;
4558}
4559
4560static int handle_external_interrupt(struct kvm_vcpu *vcpu)
4561{
4562 ++vcpu->stat.irq_exits;
4563 return 1;
4564}
4565
4566static int handle_triple_fault(struct kvm_vcpu *vcpu)
4567{
4568 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
4569 vcpu->mmio_needed = 0;
4570 return 0;
4571}
4572
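/*
 * I/O instruction exit.  The exit qualification encodes the access:
 * bits 2:0 hold size-1, bit 3 the direction (1 = IN), bit 4 whether it
 * is a string instruction, and bits 31:16 the port number.  String ops
 * go through the emulator; plain IN/OUT use the fast-PIO path.
 */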
4573static int handle_io(struct kvm_vcpu *vcpu)
4574{
4575 unsigned long exit_qualification;
4576 int size, in, string;
4577 unsigned port;
4578
4579 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4580 string = (exit_qualification & 16) != 0;
4581
4582 ++vcpu->stat.io_exits;
4583
4584 if (string)
4585 return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
4586
4587 port = exit_qualification >> 16;
4588 size = (exit_qualification & 7) + 1;
4589 in = (exit_qualification & 8) != 0;
4590
4591 return kvm_fast_pio(vcpu, size, port, in);
4592}
4593
4594static void
4595vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4596{
4597 /*
4598 * Patch in the VMCALL instruction:
4599 */
4600 hypercall[0] = 0x0f;
4601 hypercall[1] = 0x01;
4602 hypercall[2] = 0xc1;
4603}
4604
4605/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
4606static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
4607{
4608 if (is_guest_mode(vcpu)) {
4609 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4610 unsigned long orig_val = val;
4611
4612 /*
4613 * We get here when L2 changed cr0 in a way that did not change
4614 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
4615 * but did change L0 shadowed bits. So we first calculate the
4616 * effective cr0 value that L1 would like to write into the
4617 * hardware. It consists of the L2-owned bits from the new
4618 * value combined with the L1-owned bits from L1's guest_cr0.
4619 */
4620 val = (val & ~vmcs12->cr0_guest_host_mask) |
4621 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
4622
4623 if (!nested_guest_cr0_valid(vcpu, val))
4624 return 1;
4625
4626 if (kvm_set_cr0(vcpu, val))
4627 return 1;
4628 vmcs_writel(CR0_READ_SHADOW, orig_val);
4629 return 0;
4630 } else {
4631 if (to_vmx(vcpu)->nested.vmxon &&
4632 !nested_host_cr0_valid(vcpu, val))
4633 return 1;
4634
4635 return kvm_set_cr0(vcpu, val);
4636 }
4637}
4638
4639static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
4640{
4641 if (is_guest_mode(vcpu)) {
4642 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4643 unsigned long orig_val = val;
4644
4645 /* analogously to handle_set_cr0 */
4646 val = (val & ~vmcs12->cr4_guest_host_mask) |
4647 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
4648 if (kvm_set_cr4(vcpu, val))
4649 return 1;
4650 vmcs_writel(CR4_READ_SHADOW, orig_val);
4651 return 0;
4652 } else
4653 return kvm_set_cr4(vcpu, val);
4654}
4655
4656static int handle_desc(struct kvm_vcpu *vcpu)
4657{
4658 WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
4659 return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
4660}
4661
4662static int handle_cr(struct kvm_vcpu *vcpu)
4663{
4664 unsigned long exit_qualification, val;
4665 int cr;
4666 int reg;
4667 int err;
4668 int ret;
4669
4670 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4671 cr = exit_qualification & 15;
4672 reg = (exit_qualification >> 8) & 15;
4673 switch ((exit_qualification >> 4) & 3) {
4674 case 0: /* mov to cr */
4675 val = kvm_register_readl(vcpu, reg);
4676 trace_kvm_cr_write(cr, val);
4677 switch (cr) {
4678 case 0:
4679 err = handle_set_cr0(vcpu, val);
4680 return kvm_complete_insn_gp(vcpu, err);
4681 case 3:
4682 WARN_ON_ONCE(enable_unrestricted_guest);
4683 err = kvm_set_cr3(vcpu, val);
4684 return kvm_complete_insn_gp(vcpu, err);
4685 case 4:
4686 err = handle_set_cr4(vcpu, val);
4687 return kvm_complete_insn_gp(vcpu, err);
4688 case 8: {
4689 u8 cr8_prev = kvm_get_cr8(vcpu);
4690 u8 cr8 = (u8)val;
4691 err = kvm_set_cr8(vcpu, cr8);
4692 ret = kvm_complete_insn_gp(vcpu, err);
4693 if (lapic_in_kernel(vcpu))
4694 return ret;
4695 if (cr8_prev <= cr8)
4696 return ret;
4697 /*
4698 * TODO: we might be squashing a
4699 * KVM_GUESTDBG_SINGLESTEP-triggered
4700 * KVM_EXIT_DEBUG here.
4701 */
4702 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
4703 return 0;
4704 }
4705 }
4706 break;
4707 case 2: /* clts */
4708 WARN_ONCE(1, "Guest should always own CR0.TS");
4709 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
4710 trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
4711 return kvm_skip_emulated_instruction(vcpu);
4712 case 1: /*mov from cr*/
4713 switch (cr) {
4714 case 3:
4715 WARN_ON_ONCE(enable_unrestricted_guest);
4716 val = kvm_read_cr3(vcpu);
4717 kvm_register_write(vcpu, reg, val);
4718 trace_kvm_cr_read(cr, val);
4719 return kvm_skip_emulated_instruction(vcpu);
4720 case 8:
4721 val = kvm_get_cr8(vcpu);
4722 kvm_register_write(vcpu, reg, val);
4723 trace_kvm_cr_read(cr, val);
4724 return kvm_skip_emulated_instruction(vcpu);
4725 }
4726 break;
4727 case 3: /* lmsw */
4728 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
4729 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
4730 kvm_lmsw(vcpu, val);
4731
4732 return kvm_skip_emulated_instruction(vcpu);
4733 default:
4734 break;
4735 }
4736 vcpu->run->exit_reason = 0;
4737 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
4738 (int)(exit_qualification >> 4) & 3, cr);
4739 return 0;
4740}
4741
4742static int handle_dr(struct kvm_vcpu *vcpu)
4743{
4744 unsigned long exit_qualification;
4745 int dr, dr7, reg;
4746
4747 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4748 dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
4749
4750 /* First, if DR does not exist, trigger UD */
4751 if (!kvm_require_dr(vcpu, dr))
4752 return 1;
4753
4754 /* Do not handle if the CPL > 0, will trigger GP on re-entry */
4755 if (!kvm_require_cpl(vcpu, 0))
4756 return 1;
4757 dr7 = vmcs_readl(GUEST_DR7);
4758 if (dr7 & DR7_GD) {
4759 /*
4760 * As the vm-exit takes precedence over the debug trap, we
4761 * need to emulate the latter, either for the host or the
4762 * guest debugging itself.
4763 */
4764 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
4765 vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
4766 vcpu->run->debug.arch.dr7 = dr7;
4767 vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
4768 vcpu->run->debug.arch.exception = DB_VECTOR;
4769 vcpu->run->exit_reason = KVM_EXIT_DEBUG;
4770 return 0;
4771 } else {
4772 vcpu->arch.dr6 &= ~15;
4773 vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
4774 kvm_queue_exception(vcpu, DB_VECTOR);
4775 return 1;
4776 }
4777 }
4778
4779 if (vcpu->guest_debug == 0) {
4780 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
4781 CPU_BASED_MOV_DR_EXITING);
4782
4783 /*
4784 * No more DR vmexits; force a reload of the debug registers
4785 * and reenter on this instruction. The next vmexit will
4786 * retrieve the full state of the debug registers.
4787 */
4788 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
4789 return 1;
4790 }
4791
4792 reg = DEBUG_REG_ACCESS_REG(exit_qualification);
4793 if (exit_qualification & TYPE_MOV_FROM_DR) {
4794 unsigned long val;
4795
4796 if (kvm_get_dr(vcpu, dr, &val))
4797 return 1;
4798 kvm_register_write(vcpu, reg, val);
4799 } else
4800 if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
4801 return 1;
4802
4803 return kvm_skip_emulated_instruction(vcpu);
4804}
4805
4806static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
4807{
4808 return vcpu->arch.dr6;
4809}
4810
4811static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
4812{
4813}
4814
4815static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
4816{
4817 get_debugreg(vcpu->arch.db[0], 0);
4818 get_debugreg(vcpu->arch.db[1], 1);
4819 get_debugreg(vcpu->arch.db[2], 2);
4820 get_debugreg(vcpu->arch.db[3], 3);
4821 get_debugreg(vcpu->arch.dr6, 6);
4822 vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
4823
4824 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
4825 vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING);
4826}
4827
4828static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
4829{
4830 vmcs_writel(GUEST_DR7, val);
4831}
4832
4833static int handle_cpuid(struct kvm_vcpu *vcpu)
4834{
4835 return kvm_emulate_cpuid(vcpu);
4836}
4837
4838static int handle_rdmsr(struct kvm_vcpu *vcpu)
4839{
4840 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
4841 struct msr_data msr_info;
4842
4843 msr_info.index = ecx;
4844 msr_info.host_initiated = false;
4845 if (vmx_get_msr(vcpu, &msr_info)) {
4846 trace_kvm_msr_read_ex(ecx);
4847 kvm_inject_gp(vcpu, 0);
4848 return 1;
4849 }
4850
4851 trace_kvm_msr_read(ecx, msr_info.data);
4852
4853 /* FIXME: handling of bits 32:63 of rax, rdx */
4854 vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u;
4855 vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u;
4856 return kvm_skip_emulated_instruction(vcpu);
4857}
4858
4859static int handle_wrmsr(struct kvm_vcpu *vcpu)
4860{
4861 struct msr_data msr;
4862 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
4863 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
4864 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
4865
4866 msr.data = data;
4867 msr.index = ecx;
4868 msr.host_initiated = false;
4869 if (kvm_set_msr(vcpu, &msr) != 0) {
4870 trace_kvm_msr_write_ex(ecx, data);
4871 kvm_inject_gp(vcpu, 0);
4872 return 1;
4873 }
4874
4875 trace_kvm_msr_write(ecx, data);
4876 return kvm_skip_emulated_instruction(vcpu);
4877}
4878
4879static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
4880{
4881 kvm_apic_update_ppr(vcpu);
4882 return 1;
4883}
4884
4885static int handle_interrupt_window(struct kvm_vcpu *vcpu)
4886{
4887 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
4888 CPU_BASED_VIRTUAL_INTR_PENDING);
4889
4890 kvm_make_request(KVM_REQ_EVENT, vcpu);
4891
4892 ++vcpu->stat.irq_window_exits;
4893 return 1;
4894}
4895
4896static int handle_halt(struct kvm_vcpu *vcpu)
4897{
4898 return kvm_emulate_halt(vcpu);
4899}
4900
4901static int handle_vmcall(struct kvm_vcpu *vcpu)
4902{
4903 return kvm_emulate_hypercall(vcpu);
4904}
4905
4906static int handle_invd(struct kvm_vcpu *vcpu)
4907{
4908 return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
4909}
4910
4911static int handle_invlpg(struct kvm_vcpu *vcpu)
4912{
4913 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4914
4915 kvm_mmu_invlpg(vcpu, exit_qualification);
4916 return kvm_skip_emulated_instruction(vcpu);
4917}
4918
4919static int handle_rdpmc(struct kvm_vcpu *vcpu)
4920{
4921 int err;
4922
4923 err = kvm_rdpmc(vcpu);
4924 return kvm_complete_insn_gp(vcpu, err);
4925}
4926
4927static int handle_wbinvd(struct kvm_vcpu *vcpu)
4928{
4929 return kvm_emulate_wbinvd(vcpu);
4930}
4931
4932static int handle_xsetbv(struct kvm_vcpu *vcpu)
4933{
4934 u64 new_bv = kvm_read_edx_eax(vcpu);
4935 u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
4936
4937 if (kvm_set_xcr(vcpu, index, new_bv) == 0)
4938 return kvm_skip_emulated_instruction(vcpu);
4939 return 1;
4940}
4941
4942static int handle_xsaves(struct kvm_vcpu *vcpu)
4943{
4944 kvm_skip_emulated_instruction(vcpu);
4945 WARN(1, "this should never happen\n");
4946 return 1;
4947}
4948
4949static int handle_xrstors(struct kvm_vcpu *vcpu)
4950{
4951 kvm_skip_emulated_instruction(vcpu);
4952 WARN(1, "this should never happen\n");
4953 return 1;
4954}
4955
4956static int handle_apic_access(struct kvm_vcpu *vcpu)
4957{
4958 if (likely(fasteoi)) {
4959 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4960 int access_type, offset;
4961
4962 access_type = exit_qualification & APIC_ACCESS_TYPE;
4963 offset = exit_qualification & APIC_ACCESS_OFFSET;
4964 /*
4965			 * A sane guest uses MOV to write the EOI register, and the
4966			 * written value is ignored. Short-circuit that case here to
4967			 * avoid heavy instruction emulation.
4968 */
4969 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
4970 (offset == APIC_EOI)) {
4971 kvm_lapic_set_eoi(vcpu);
4972 return kvm_skip_emulated_instruction(vcpu);
4973 }
4974 }
4975 return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
4976}
4977
4978static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
4979{
4980 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4981 int vector = exit_qualification & 0xff;
4982
4983 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
4984 kvm_apic_set_eoi_accelerated(vcpu, vector);
4985 return 1;
4986}
4987
4988static int handle_apic_write(struct kvm_vcpu *vcpu)
4989{
4990 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4991 u32 offset = exit_qualification & 0xfff;
4992
4993 /* APIC-write VM exit is trap-like and thus no need to adjust IP */
4994 kvm_apic_write_nodecode(vcpu, offset);
4995 return 1;
4996}
4997
4998static int handle_task_switch(struct kvm_vcpu *vcpu)
4999{
5000 struct vcpu_vmx *vmx = to_vmx(vcpu);
5001 unsigned long exit_qualification;
5002 bool has_error_code = false;
5003 u32 error_code = 0;
5004 u16 tss_selector;
5005 int reason, type, idt_v, idt_index;
5006
5007 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
5008 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
5009 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
5010
5011 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5012
5013 reason = (u32)exit_qualification >> 30;
5014 if (reason == TASK_SWITCH_GATE && idt_v) {
5015 switch (type) {
5016 case INTR_TYPE_NMI_INTR:
5017 vcpu->arch.nmi_injected = false;
5018 vmx_set_nmi_mask(vcpu, true);
5019 break;
5020 case INTR_TYPE_EXT_INTR:
5021 case INTR_TYPE_SOFT_INTR:
5022 kvm_clear_interrupt_queue(vcpu);
5023 break;
5024 case INTR_TYPE_HARD_EXCEPTION:
5025 if (vmx->idt_vectoring_info &
5026 VECTORING_INFO_DELIVER_CODE_MASK) {
5027 has_error_code = true;
5028 error_code =
5029 vmcs_read32(IDT_VECTORING_ERROR_CODE);
5030 }
5031 /* fall through */
5032 case INTR_TYPE_SOFT_EXCEPTION:
5033 kvm_clear_exception_queue(vcpu);
5034 break;
5035 default:
5036 break;
5037 }
5038 }
5039 tss_selector = exit_qualification;
5040
5041 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
5042 type != INTR_TYPE_EXT_INTR &&
5043 type != INTR_TYPE_NMI_INTR))
5044 skip_emulated_instruction(vcpu);
5045
5046 if (kvm_task_switch(vcpu, tss_selector,
5047 type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
5048 has_error_code, error_code) == EMULATE_FAIL) {
5049 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5050 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
5051 vcpu->run->internal.ndata = 0;
5052 return 0;
5053 }
5054
5055 /*
5056 * TODO: What about debug traps on tss switch?
5057 * Are we supposed to inject them and update dr6?
5058 */
5059
5060 return 1;
5061}
5062
5063static int handle_ept_violation(struct kvm_vcpu *vcpu)
5064{
5065 unsigned long exit_qualification;
5066 gpa_t gpa;
5067 u64 error_code;
5068
5069 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5070
5071 /*
5072	 * If this EPT violation happened while executing IRET from an NMI,
5073	 * the "blocked by NMI" bit has to be set before the next VM entry.
5074 * There are errata that may cause this bit to not be set:
5075 * AAK134, BY25.
5076 */
5077 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5078 enable_vnmi &&
5079 (exit_qualification & INTR_INFO_UNBLOCK_NMI))
5080 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
5081
5082 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5083 trace_kvm_page_fault(gpa, exit_qualification);
5084
5085 /* Is it a read fault? */
5086 error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
5087 ? PFERR_USER_MASK : 0;
5088 /* Is it a write fault? */
5089 error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
5090 ? PFERR_WRITE_MASK : 0;
5091 /* Is it a fetch fault? */
5092 error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
5093 ? PFERR_FETCH_MASK : 0;
5094 /* ept page table entry is present? */
5095 error_code |= (exit_qualification &
5096 (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
5097 EPT_VIOLATION_EXECUTABLE))
5098 ? PFERR_PRESENT_MASK : 0;
5099
5100 error_code |= (exit_qualification & 0x100) != 0 ?
5101 PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
5102
5103 vcpu->arch.exit_qualification = exit_qualification;
5104 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
5105}
5106
5107static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
5108{
5109 gpa_t gpa;
5110
5111 /*
5112 * A nested guest cannot optimize MMIO vmexits, because we have an
5113 * nGPA here instead of the required GPA.
5114 */
5115 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5116 if (!is_guest_mode(vcpu) &&
5117 !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
5118 trace_kvm_fast_mmio(gpa);
5119 /*
5120		 * Doing kvm_skip_emulated_instruction() relies on undefined
5121		 * behavior: Intel's manual doesn't mandate that
5122		 * VM_EXIT_INSTRUCTION_LEN be set in the VMCS when an EPT
5123		 * misconfig occurs. While real hardware was observed to set it,
5124		 * other hypervisors (namely Hyper-V) don't, so we would end up
5125		 * advancing the IP by some random value. Disable fast MMIO when
5126		 * running nested and keep it for real hardware, in the hope that
5127		 * VM_EXIT_INSTRUCTION_LEN will always be set correctly.
5128 */
5129 if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
5130 return kvm_skip_emulated_instruction(vcpu);
5131 else
5132 return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) ==
5133 EMULATE_DONE;
5134 }
5135
5136 return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
5137}
5138
5139static int handle_nmi_window(struct kvm_vcpu *vcpu)
5140{
5141 WARN_ON_ONCE(!enable_vnmi);
5142 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
5143 CPU_BASED_VIRTUAL_NMI_PENDING);
5144 ++vcpu->stat.nmi_window_exits;
5145 kvm_make_request(KVM_REQ_EVENT, vcpu);
5146
5147 return 1;
5148}
5149
5150static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
5151{
5152 struct vcpu_vmx *vmx = to_vmx(vcpu);
5153 enum emulation_result err = EMULATE_DONE;
5154 int ret = 1;
5155 u32 cpu_exec_ctrl;
5156 bool intr_window_requested;
5157 unsigned count = 130;
5158
5159 /*
5160 * We should never reach the point where we are emulating L2
5161 * due to invalid guest state as that means we incorrectly
5162 * allowed a nested VMEntry with an invalid vmcs12.
5163 */
5164 WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending);
5165
5166 cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
5167 intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
5168
5169 while (vmx->emulation_required && count-- != 0) {
5170 if (intr_window_requested && vmx_interrupt_allowed(vcpu))
5171 return handle_interrupt_window(&vmx->vcpu);
5172
5173 if (kvm_test_request(KVM_REQ_EVENT, vcpu))
5174 return 1;
5175
5176 err = kvm_emulate_instruction(vcpu, 0);
5177
5178 if (err == EMULATE_USER_EXIT) {
5179 ++vcpu->stat.mmio_exits;
5180 ret = 0;
5181 goto out;
5182 }
5183
5184 if (err != EMULATE_DONE)
5185 goto emulation_error;
5186
5187 if (vmx->emulation_required && !vmx->rmode.vm86_active &&
5188 vcpu->arch.exception.pending)
5189 goto emulation_error;
5190
5191 if (vcpu->arch.halt_request) {
5192 vcpu->arch.halt_request = 0;
5193 ret = kvm_vcpu_halt(vcpu);
5194 goto out;
5195 }
5196
5197 if (signal_pending(current))
5198 goto out;
5199 if (need_resched())
5200 schedule();
5201 }
5202
5203out:
5204 return ret;
5205
5206emulation_error:
5207 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5208 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
5209 vcpu->run->internal.ndata = 0;
5210 return 0;
5211}
5212
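/*
 * Adaptive PAUSE-loop-exiting window: the window grows on every PAUSE
 * exit (see handle_pause()) so that a vcpu that keeps spinning stops
 * exiting, and shrink_ple_window() moves it back toward ple_window,
 * staying within the [ple_window, ple_window_max] bounds.
 */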
5213static void grow_ple_window(struct kvm_vcpu *vcpu)
5214{
5215 struct vcpu_vmx *vmx = to_vmx(vcpu);
5216 int old = vmx->ple_window;
5217
5218 vmx->ple_window = __grow_ple_window(old, ple_window,
5219 ple_window_grow,
5220 ple_window_max);
5221
5222 if (vmx->ple_window != old)
5223 vmx->ple_window_dirty = true;
5224
5225 trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old);
5226}
5227
5228static void shrink_ple_window(struct kvm_vcpu *vcpu)
5229{
5230 struct vcpu_vmx *vmx = to_vmx(vcpu);
5231 int old = vmx->ple_window;
5232
5233 vmx->ple_window = __shrink_ple_window(old, ple_window,
5234 ple_window_shrink,
5235 ple_window);
5236
5237 if (vmx->ple_window != old)
5238 vmx->ple_window_dirty = true;
5239
5240 trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old);
5241}
5242
5243/*
5244 * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
5245 */
5246static void wakeup_handler(void)
5247{
5248 struct kvm_vcpu *vcpu;
5249 int cpu = smp_processor_id();
5250
5251 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
5252 list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
5253 blocked_vcpu_list) {
5254 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
5255
5256 if (pi_test_on(pi_desc) == 1)
5257 kvm_vcpu_kick(vcpu);
5258 }
5259 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
5260}
5261
5262static void vmx_enable_tdp(void)
5263{
5264 kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
5265 enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
5266 enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
5267 0ull, VMX_EPT_EXECUTABLE_MASK,
5268 cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
5269 VMX_EPT_RWX_MASK, 0ull);
5270
5271 ept_set_mmio_spte_mask();
5272 kvm_enable_tdp();
5273}
5274
5275/*
5276 * Indicates that a vcpu is busy-waiting on a spinlock. We do not enable PAUSE
5277 * exiting, so we only get here on CPUs with PAUSE-loop exiting.
5278 */
5279static int handle_pause(struct kvm_vcpu *vcpu)
5280{
5281 if (!kvm_pause_in_guest(vcpu->kvm))
5282 grow_ple_window(vcpu);
5283
5284 /*
5285	 * Intel SDM Vol. 3, Section 25.1.3 says: the "PAUSE-loop exiting"
5286	 * VM-execution control is ignored if CPL > 0. OTOH, KVM
5287	 * never sets PAUSE_EXITING and only sets PLE if supported,
5288	 * so the vcpu must be at CPL 0 if it gets a PAUSE exit.
5289 */
5290 kvm_vcpu_on_spin(vcpu, true);
5291 return kvm_skip_emulated_instruction(vcpu);
5292}
5293
5294static int handle_nop(struct kvm_vcpu *vcpu)
5295{
5296 return kvm_skip_emulated_instruction(vcpu);
5297}
5298
5299static int handle_mwait(struct kvm_vcpu *vcpu)
5300{
5301 printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
5302 return handle_nop(vcpu);
5303}
5304
5305static int handle_invalid_op(struct kvm_vcpu *vcpu)
5306{
5307 kvm_queue_exception(vcpu, UD_VECTOR);
5308 return 1;
5309}
5310
5311static int handle_monitor_trap(struct kvm_vcpu *vcpu)
5312{
5313 return 1;
5314}
5315
5316static int handle_monitor(struct kvm_vcpu *vcpu)
5317{
5318 printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
5319 return handle_nop(vcpu);
5320}
5321
5322static int handle_invpcid(struct kvm_vcpu *vcpu)
5323{
5324 u32 vmx_instruction_info;
5325 unsigned long type;
5326 bool pcid_enabled;
5327 gva_t gva;
5328 struct x86_exception e;
5329 unsigned i;
5330 unsigned long roots_to_free = 0;
5331 struct {
5332 u64 pcid;
5333 u64 gla;
5334 } operand;
5335
5336 if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
5337 kvm_queue_exception(vcpu, UD_VECTOR);
5338 return 1;
5339 }
5340
5341 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5342 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
5343
5344 if (type > 3) {
5345 kvm_inject_gp(vcpu, 0);
5346 return 1;
5347 }
5348
5349 /* According to the Intel instruction reference, the memory operand
5350	 * is read even if it isn't needed (e.g., for type == all-context).
5351 */
5352 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
5353 vmx_instruction_info, false, &gva))
5354 return 1;
5355
5356 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
5357 kvm_inject_page_fault(vcpu, &e);
5358 return 1;
5359 }
5360
5361 if (operand.pcid >> 12 != 0) {
5362 kvm_inject_gp(vcpu, 0);
5363 return 1;
5364 }
5365
5366 pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
5367
5368 switch (type) {
5369 case INVPCID_TYPE_INDIV_ADDR:
5370 if ((!pcid_enabled && (operand.pcid != 0)) ||
5371 is_noncanonical_address(operand.gla, vcpu)) {
5372 kvm_inject_gp(vcpu, 0);
5373 return 1;
5374 }
5375 kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
5376 return kvm_skip_emulated_instruction(vcpu);
5377
5378 case INVPCID_TYPE_SINGLE_CTXT:
5379 if (!pcid_enabled && (operand.pcid != 0)) {
5380 kvm_inject_gp(vcpu, 0);
5381 return 1;
5382 }
5383
5384 if (kvm_get_active_pcid(vcpu) == operand.pcid) {
5385 kvm_mmu_sync_roots(vcpu);
5386 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
5387 }
5388
5389 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5390 if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3)
5391 == operand.pcid)
5392 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
5393
5394 kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
5395 /*
5396 * If neither the current cr3 nor any of the prev_roots use the
5397 * given PCID, then nothing needs to be done here because a
5398 * resync will happen anyway before switching to any other CR3.
5399 */
5400
5401 return kvm_skip_emulated_instruction(vcpu);
5402
5403 case INVPCID_TYPE_ALL_NON_GLOBAL:
5404 /*
5405 * Currently, KVM doesn't mark global entries in the shadow
5406 * page tables, so a non-global flush just degenerates to a
5407 * global flush. If needed, we could optimize this later by
5408 * keeping track of global entries in shadow page tables.
5409 */
5410
5411 /* fall-through */
5412 case INVPCID_TYPE_ALL_INCL_GLOBAL:
5413 kvm_mmu_unload(vcpu);
5414 return kvm_skip_emulated_instruction(vcpu);
5415
5416 default:
5417 BUG(); /* We have already checked above that type <= 3 */
5418 }
5419}
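/*
 * Illustration (not from the original source): the 16-byte INVPCID
 * descriptor read above packs the PCID into bits 11:0 of the first
 * quadword (bits 63:12 are reserved, hence the "operand.pcid >> 12"
 * check) and the linear address into the second quadword.  For example,
 * an individual-address invalidation (type 0) with
 * { .pcid = 5, .gla = 0x7fffc000 } asks for the mappings of that one
 * linear address tagged with PCID 5 to be flushed, which the handler
 * maps onto kvm_mmu_invpcid_gva().
 */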
5420
5421static int handle_pml_full(struct kvm_vcpu *vcpu)
5422{
5423 unsigned long exit_qualification;
5424
5425 trace_kvm_pml_full(vcpu->vcpu_id);
5426
5427 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5428
5429 /*
5430	 * If the PML buffer became full while executing IRET from an NMI, the
5431	 * "blocked by NMI" bit has to be set before the next VM entry.
5432 */
5433 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5434 enable_vnmi &&
5435 (exit_qualification & INTR_INFO_UNBLOCK_NMI))
5436 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
5437 GUEST_INTR_STATE_NMI);
5438
5439 /*
5440	 * The PML buffer was already flushed at the beginning of the VM exit.
5441	 * Nothing to do here, and there's no userspace involvement needed for PML.
5442 */
5443 return 1;
5444}
5445
5446static int handle_preemption_timer(struct kvm_vcpu *vcpu)
5447{
5448 if (!to_vmx(vcpu)->req_immediate_exit)
5449 kvm_lapic_expired_hv_timer(vcpu);
5450 return 1;
5451}
5452
5453/*
5454 * When nested=0, all VMX instruction VM Exits filter here. The handlers
5455 * are overwritten by nested_vmx_setup() when nested=1.
5456 */
5457static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
5458{
5459 kvm_queue_exception(vcpu, UD_VECTOR);
5460 return 1;
5461}
5462
5463static int handle_encls(struct kvm_vcpu *vcpu)
5464{
5465 /*
5466 * SGX virtualization is not yet supported. There is no software
5467 * enable bit for SGX, so we have to trap ENCLS and inject a #UD
5468 * to prevent the guest from executing ENCLS.
5469 */
5470 kvm_queue_exception(vcpu, UD_VECTOR);
5471 return 1;
5472}
5473
5474/*
5475 * The exit handlers return 1 if the exit was handled fully and guest execution
5476 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
5477 * to be done to userspace and return 0.
5478 */
5479static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
5480 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
5481 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
5482 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
5483 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
5484 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
5485 [EXIT_REASON_CR_ACCESS] = handle_cr,
5486 [EXIT_REASON_DR_ACCESS] = handle_dr,
5487 [EXIT_REASON_CPUID] = handle_cpuid,
5488 [EXIT_REASON_MSR_READ] = handle_rdmsr,
5489 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
5490 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
5491 [EXIT_REASON_HLT] = handle_halt,
5492 [EXIT_REASON_INVD] = handle_invd,
5493 [EXIT_REASON_INVLPG] = handle_invlpg,
5494 [EXIT_REASON_RDPMC] = handle_rdpmc,
5495 [EXIT_REASON_VMCALL] = handle_vmcall,
5496 [EXIT_REASON_VMCLEAR] = handle_vmx_instruction,
5497 [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction,
5498 [EXIT_REASON_VMPTRLD] = handle_vmx_instruction,
5499 [EXIT_REASON_VMPTRST] = handle_vmx_instruction,
5500 [EXIT_REASON_VMREAD] = handle_vmx_instruction,
5501 [EXIT_REASON_VMRESUME] = handle_vmx_instruction,
5502 [EXIT_REASON_VMWRITE] = handle_vmx_instruction,
5503 [EXIT_REASON_VMOFF] = handle_vmx_instruction,
5504 [EXIT_REASON_VMON] = handle_vmx_instruction,
5505 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
5506 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
5507 [EXIT_REASON_APIC_WRITE] = handle_apic_write,
5508 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
5509 [EXIT_REASON_WBINVD] = handle_wbinvd,
5510 [EXIT_REASON_XSETBV] = handle_xsetbv,
5511 [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
5512 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
5513 [EXIT_REASON_GDTR_IDTR] = handle_desc,
5514 [EXIT_REASON_LDTR_TR] = handle_desc,
5515 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
5516 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
5517 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
5518 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait,
5519 [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap,
5520 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor,
5521 [EXIT_REASON_INVEPT] = handle_vmx_instruction,
5522 [EXIT_REASON_INVVPID] = handle_vmx_instruction,
5523 [EXIT_REASON_RDRAND] = handle_invalid_op,
5524 [EXIT_REASON_RDSEED] = handle_invalid_op,
5525 [EXIT_REASON_XSAVES] = handle_xsaves,
5526 [EXIT_REASON_XRSTORS] = handle_xrstors,
5527 [EXIT_REASON_PML_FULL] = handle_pml_full,
5528 [EXIT_REASON_INVPCID] = handle_invpcid,
5529 [EXIT_REASON_VMFUNC] = handle_vmx_instruction,
5530 [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
5531 [EXIT_REASON_ENCLS] = handle_encls,
5532};
5533
5534static const int kvm_vmx_max_exit_handlers =
5535 ARRAY_SIZE(kvm_vmx_exit_handlers);
5536
5537static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
5538{
5539 *info1 = vmcs_readl(EXIT_QUALIFICATION);
5540 *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
5541}
5542
5543static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
5544{
5545 if (vmx->pml_pg) {
5546 __free_page(vmx->pml_pg);
5547 vmx->pml_pg = NULL;
5548 }
5549}
5550
5551static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
5552{
5553 struct vcpu_vmx *vmx = to_vmx(vcpu);
5554 u64 *pml_buf;
5555 u16 pml_idx;
5556
5557 pml_idx = vmcs_read16(GUEST_PML_INDEX);
5558
5559 /* Do nothing if PML buffer is empty */
5560 if (pml_idx == (PML_ENTITY_NUM - 1))
5561 return;
5562
5563 /* PML index always points to next available PML buffer entity */
5564 if (pml_idx >= PML_ENTITY_NUM)
5565 pml_idx = 0;
5566 else
5567 pml_idx++;
5568
5569 pml_buf = page_address(vmx->pml_pg);
5570 for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
5571 u64 gpa;
5572
5573 gpa = pml_buf[pml_idx];
5574 WARN_ON(gpa & (PAGE_SIZE - 1));
5575 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
5576 }
5577
5578 /* reset PML index */
5579 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
5580}
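/*
 * Worked example (illustrative, not part of the original source): the PML
 * buffer is one page of PML_ENTITY_NUM (512) 8-byte GPA entries, and the
 * CPU fills it backwards, writing the entry at the current GUEST_PML_INDEX
 * (initially 511) and then decrementing the index.  If GUEST_PML_INDEX
 * reads back as 508, entries 509..511 are valid, so the loop above starts
 * at pml_idx = 509 and marks those three GPAs dirty; an index that has
 * wrapped past 0 (0xffff) means the whole buffer is full and the walk
 * starts at 0.
 */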
5581
5582/*
5583 * Flush all vcpus' PML buffers and update the logged GPAs in dirty_bitmap.
5584 * Called before reporting dirty_bitmap to userspace.
5585 */
5586static void kvm_flush_pml_buffers(struct kvm *kvm)
5587{
5588 int i;
5589 struct kvm_vcpu *vcpu;
5590 /*
5591	 * We only need to kick vcpus out of guest mode here, as the PML
5592	 * buffer is flushed at the beginning of every VM exit, so only vcpus
5593	 * currently running in guest mode can have unflushed GPAs in their
5594	 * PML buffers.
5595 */
5596 kvm_for_each_vcpu(i, vcpu, kvm)
5597 kvm_vcpu_kick(vcpu);
5598}
5599
5600static void vmx_dump_sel(char *name, uint32_t sel)
5601{
5602 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
5603 name, vmcs_read16(sel),
5604 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
5605 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
5606 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
5607}
5608
5609static void vmx_dump_dtsel(char *name, uint32_t limit)
5610{
5611 pr_err("%s limit=0x%08x, base=0x%016lx\n",
5612 name, vmcs_read32(limit),
5613 vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
5614}
5615
5616static void dump_vmcs(void)
5617{
5618 u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
5619 u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
5620 u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
5621 u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
5622 u32 secondary_exec_control = 0;
5623 unsigned long cr4 = vmcs_readl(GUEST_CR4);
5624 u64 efer = vmcs_read64(GUEST_IA32_EFER);
5625 int i, n;
5626
5627 if (cpu_has_secondary_exec_ctrls())
5628 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
5629
5630 pr_err("*** Guest State ***\n");
5631 pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
5632 vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
5633 vmcs_readl(CR0_GUEST_HOST_MASK));
5634 pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
5635 cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
5636 pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
5637 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
5638 (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
5639 {
5640 pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
5641 vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
5642 pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
5643 vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
5644 }
5645 pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
5646 vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
5647 pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
5648 vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
5649 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
5650 vmcs_readl(GUEST_SYSENTER_ESP),
5651 vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
5652 vmx_dump_sel("CS: ", GUEST_CS_SELECTOR);
5653 vmx_dump_sel("DS: ", GUEST_DS_SELECTOR);
5654 vmx_dump_sel("SS: ", GUEST_SS_SELECTOR);
5655 vmx_dump_sel("ES: ", GUEST_ES_SELECTOR);
5656 vmx_dump_sel("FS: ", GUEST_FS_SELECTOR);
5657 vmx_dump_sel("GS: ", GUEST_GS_SELECTOR);
5658 vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
5659 vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
5660 vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
5661 vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
5662 if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
5663 (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
5664 pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
5665 efer, vmcs_read64(GUEST_IA32_PAT));
5666 pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
5667 vmcs_read64(GUEST_IA32_DEBUGCTL),
5668 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
5669 if (cpu_has_load_perf_global_ctrl() &&
5670 vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
5671 pr_err("PerfGlobCtl = 0x%016llx\n",
5672 vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
5673 if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
5674 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
5675 pr_err("Interruptibility = %08x ActivityState = %08x\n",
5676 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
5677 vmcs_read32(GUEST_ACTIVITY_STATE));
5678 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
5679 pr_err("InterruptStatus = %04x\n",
5680 vmcs_read16(GUEST_INTR_STATUS));
5681
5682 pr_err("*** Host State ***\n");
5683 pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
5684 vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
5685 pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
5686 vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
5687 vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
5688 vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
5689 vmcs_read16(HOST_TR_SELECTOR));
5690 pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
5691 vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
5692 vmcs_readl(HOST_TR_BASE));
5693 pr_err("GDTBase=%016lx IDTBase=%016lx\n",
5694 vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
5695 pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
5696 vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
5697 vmcs_readl(HOST_CR4));
5698 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
5699 vmcs_readl(HOST_IA32_SYSENTER_ESP),
5700 vmcs_read32(HOST_IA32_SYSENTER_CS),
5701 vmcs_readl(HOST_IA32_SYSENTER_EIP));
5702 if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
5703 pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
5704 vmcs_read64(HOST_IA32_EFER),
5705 vmcs_read64(HOST_IA32_PAT));
5706 if (cpu_has_load_perf_global_ctrl() &&
5707 vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
5708 pr_err("PerfGlobCtl = 0x%016llx\n",
5709 vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
5710
5711 pr_err("*** Control State ***\n");
5712 pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
5713 pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
5714 pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
5715 pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
5716 vmcs_read32(EXCEPTION_BITMAP),
5717 vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
5718 vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
5719 pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
5720 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
5721 vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
5722 vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
5723 pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
5724 vmcs_read32(VM_EXIT_INTR_INFO),
5725 vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
5726 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
5727 pr_err(" reason=%08x qualification=%016lx\n",
5728 vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
5729 pr_err("IDTVectoring: info=%08x errcode=%08x\n",
5730 vmcs_read32(IDT_VECTORING_INFO_FIELD),
5731 vmcs_read32(IDT_VECTORING_ERROR_CODE));
5732 pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
5733 if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
5734 pr_err("TSC Multiplier = 0x%016llx\n",
5735 vmcs_read64(TSC_MULTIPLIER));
5736 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW)
5737 pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
5738 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
5739 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
5740 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
5741 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
5742 n = vmcs_read32(CR3_TARGET_COUNT);
5743 for (i = 0; i + 1 < n; i += 4)
5744 pr_err("CR3 target%u=%016lx target%u=%016lx\n",
5745 i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2),
5746 i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2));
5747 if (i < n)
5748 pr_err("CR3 target%u=%016lx\n",
5749 i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2));
5750 if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
5751 pr_err("PLE Gap=%08x Window=%08x\n",
5752 vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
5753 if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
5754 pr_err("Virtual processor ID = 0x%04x\n",
5755 vmcs_read16(VIRTUAL_PROCESSOR_ID));
5756}
5757
5758/*
5759 * The guest has exited. See if we can fix it or if we need userspace
5760 * assistance.
5761 */
5762static int vmx_handle_exit(struct kvm_vcpu *vcpu)
5763{
5764 struct vcpu_vmx *vmx = to_vmx(vcpu);
5765 u32 exit_reason = vmx->exit_reason;
5766 u32 vectoring_info = vmx->idt_vectoring_info;
5767
5768 trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
5769
5770 /*
5771	 * Flush the logged GPAs out of the PML buffer; this keeps dirty_bitmap
5772	 * as up to date as possible. Another benefit: in
5773	 * kvm_vm_ioctl_get_dirty_log, before querying dirty_bitmap, we only
5774	 * need to kick all vcpus out of guest mode, because once a vcpu is in
5775	 * root mode its PML buffer must already have been flushed.
5776 */
5777 if (enable_pml)
5778 vmx_flush_pml_buffer(vcpu);
5779
5780 /* If guest state is invalid, start emulating */
5781 if (vmx->emulation_required)
5782 return handle_invalid_guest_state(vcpu);
5783
5784 if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
5785 return nested_vmx_reflect_vmexit(vcpu, exit_reason);
5786
5787 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
5788 dump_vmcs();
5789 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
5790 vcpu->run->fail_entry.hardware_entry_failure_reason
5791 = exit_reason;
5792 return 0;
5793 }
5794
5795 if (unlikely(vmx->fail)) {
5796 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
5797 vcpu->run->fail_entry.hardware_entry_failure_reason
5798 = vmcs_read32(VM_INSTRUCTION_ERROR);
5799 return 0;
5800 }
5801
5802 /*
5803 * Note:
5804	 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by
5805	 * event delivery, since that indicates the guest is accessing MMIO.
5806	 * The VM exit would be triggered again after returning to the guest,
5807	 * which would cause an infinite loop.
5808 */
5809 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
5810 (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
5811 exit_reason != EXIT_REASON_EPT_VIOLATION &&
5812 exit_reason != EXIT_REASON_PML_FULL &&
5813 exit_reason != EXIT_REASON_TASK_SWITCH)) {
5814 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5815 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
5816 vcpu->run->internal.ndata = 3;
5817 vcpu->run->internal.data[0] = vectoring_info;
5818 vcpu->run->internal.data[1] = exit_reason;
5819 vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
5820 if (exit_reason == EXIT_REASON_EPT_MISCONFIG) {
5821 vcpu->run->internal.ndata++;
5822 vcpu->run->internal.data[3] =
5823 vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5824 }
5825 return 0;
5826 }
5827
5828 if (unlikely(!enable_vnmi &&
5829 vmx->loaded_vmcs->soft_vnmi_blocked)) {
5830 if (vmx_interrupt_allowed(vcpu)) {
5831 vmx->loaded_vmcs->soft_vnmi_blocked = 0;
5832 } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
5833 vcpu->arch.nmi_pending) {
5834 /*
5835			 * This CPU doesn't help us find the end of an
5836 * NMI-blocked window if the guest runs with IRQs
5837 * disabled. So we pull the trigger after 1 s of
5838 * futile waiting, but inform the user about this.
5839 */
5840 printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
5841 "state on VCPU %d after 1 s timeout\n",
5842 __func__, vcpu->vcpu_id);
5843 vmx->loaded_vmcs->soft_vnmi_blocked = 0;
5844 }
5845 }
5846
5847 if (exit_reason < kvm_vmx_max_exit_handlers
5848 && kvm_vmx_exit_handlers[exit_reason])
5849 return kvm_vmx_exit_handlers[exit_reason](vcpu);
5850 else {
5851 vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
5852 exit_reason);
5853 kvm_queue_exception(vcpu, UD_VECTOR);
5854 return 1;
5855 }
5856}
5857
5858/*
5859 * Software-based L1D cache flush, used when microcode providing the
5860 * cache control MSR is not loaded.
5861 *
5862 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but
5863 * flushing it requires reading 64 KiB because the replacement algorithm
5864 * is not exactly LRU. This could be sized at runtime via topology
5865 * information, but as all relevant affected CPUs have a 32 KiB L1D cache
5866 * there is no point in doing so.
5867 */
5868static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
5869{
5870 int size = PAGE_SIZE << L1D_CACHE_ORDER;
5871
5872 /*
5873	 * This code is only executed when the flush mode is 'cond' or
5874	 * 'always'.
5875 */
5876 if (static_branch_likely(&vmx_l1d_flush_cond)) {
5877 bool flush_l1d;
5878
5879 /*
5880		 * Clear the per-vcpu flush bit; it gets set again
5881 * either from vcpu_run() or from one of the unsafe
5882 * VMEXIT handlers.
5883 */
5884 flush_l1d = vcpu->arch.l1tf_flush_l1d;
5885 vcpu->arch.l1tf_flush_l1d = false;
5886
5887 /*
5888		 * Clear the per-cpu flush bit; it gets set again from
5889 * the interrupt handlers.
5890 */
5891 flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
5892 kvm_clear_cpu_l1tf_flush_l1d();
5893
5894 if (!flush_l1d)
5895 return;
5896 }
5897
5898 vcpu->stat.l1d_flush++;
5899
5900 if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
5901 wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
5902 return;
5903 }
5904
5905 asm volatile(
5906 /* First ensure the pages are in the TLB */
5907 "xorl %%eax, %%eax\n"
5908 ".Lpopulate_tlb:\n\t"
5909 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
5910 "addl $4096, %%eax\n\t"
5911 "cmpl %%eax, %[size]\n\t"
5912 "jne .Lpopulate_tlb\n\t"
5913 "xorl %%eax, %%eax\n\t"
5914 "cpuid\n\t"
5915 /* Now fill the cache */
5916 "xorl %%eax, %%eax\n"
5917 ".Lfill_cache:\n"
5918 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
5919 "addl $64, %%eax\n\t"
5920 "cmpl %%eax, %[size]\n\t"
5921 "jne .Lfill_cache\n\t"
5922 "lfence\n"
5923 :: [flush_pages] "r" (vmx_l1d_flush_pages),
5924 [size] "r" (size)
5925 : "eax", "ebx", "ecx", "edx");
5926}
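/*
 * Rough C equivalent of the asm sequence above (sketch only; the real
 * code stays in asm so the compiler cannot reorder or elide the loads):
 *
 *	u8 *p = vmx_l1d_flush_pages;
 *
 *	for (i = 0; i < size; i += PAGE_SIZE)	// touch each page -> TLB fill
 *		(void)READ_ONCE(p[i]);
 *	// serialize with CPUID so the TLB fills complete
 *	for (i = 0; i < size; i += 64)		// one load per cache line,
 *		(void)READ_ONCE(p[i]);		// displacing the L1D contents
 *	// lfence
 *
 * The 64-byte stride matches the cache line size of the affected CPUs, and
 * size is 64 KiB (PAGE_SIZE << L1D_CACHE_ORDER) as explained above.
 */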
5927
5928static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
5929{
5930 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5931
5932 if (is_guest_mode(vcpu) &&
5933 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
5934 return;
5935
5936 if (irr == -1 || tpr < irr) {
5937 vmcs_write32(TPR_THRESHOLD, 0);
5938 return;
5939 }
5940
5941 vmcs_write32(TPR_THRESHOLD, irr);
5942}
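/*
 * Background note (illustrative, not from the original source): with the
 * TPR shadow in use, TPR_THRESHOLD is compared against bits 7:4 of the
 * virtual TPR, and lowering the TPR below the threshold causes a
 * TPR-below-threshold exit.  The irr argument is typically the priority
 * class (vector >> 4) of the highest pending interrupt as computed by the
 * common x86 code, so if the guest's TPR currently blocks a pending
 * vector 0x51 (class 5), the threshold is set to 5 and KVM regains
 * control as soon as the guest drops its TPR below that class and the
 * interrupt becomes injectable; otherwise the threshold is simply 0.
 */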
5943
5944void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
5945{
5946 u32 sec_exec_control;
5947
5948 if (!lapic_in_kernel(vcpu))
5949 return;
5950
5951 if (!flexpriority_enabled &&
5952 !cpu_has_vmx_virtualize_x2apic_mode())
5953 return;
5954
5955 /* Postpone execution until vmcs01 is the current VMCS. */
5956 if (is_guest_mode(vcpu)) {
5957 to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true;
5958 return;
5959 }
5960
5961 sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
5962 sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
5963 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
5964
5965 switch (kvm_get_apic_mode(vcpu)) {
5966 case LAPIC_MODE_INVALID:
5967 WARN_ONCE(true, "Invalid local APIC state");
5968 case LAPIC_MODE_DISABLED:
5969 break;
5970 case LAPIC_MODE_XAPIC:
5971 if (flexpriority_enabled) {
5972 sec_exec_control |=
5973 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
5974 vmx_flush_tlb(vcpu, true);
5975 }
5976 break;
5977 case LAPIC_MODE_X2APIC:
5978 if (cpu_has_vmx_virtualize_x2apic_mode())
5979 sec_exec_control |=
5980 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
5981 break;
5982 }
5983 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
5984
5985 vmx_update_msr_bitmap(vcpu);
5986}
5987
5988static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
5989{
5990 if (!is_guest_mode(vcpu)) {
5991 vmcs_write64(APIC_ACCESS_ADDR, hpa);
5992 vmx_flush_tlb(vcpu, true);
5993 }
5994}
5995
5996static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
5997{
5998 u16 status;
5999 u8 old;
6000
6001 if (max_isr == -1)
6002 max_isr = 0;
6003
6004 status = vmcs_read16(GUEST_INTR_STATUS);
6005 old = status >> 8;
6006 if (max_isr != old) {
6007 status &= 0xff;
6008 status |= max_isr << 8;
6009 vmcs_write16(GUEST_INTR_STATUS, status);
6010 }
6011}
6012
6013static void vmx_set_rvi(int vector)
6014{
6015 u16 status;
6016 u8 old;
6017
6018 if (vector == -1)
6019 vector = 0;
6020
6021 status = vmcs_read16(GUEST_INTR_STATUS);
6022 old = (u8)status & 0xff;
6023 if ((u8)vector != old) {
6024 status &= ~0xff;
6025 status |= (u8)vector;
6026 vmcs_write16(GUEST_INTR_STATUS, status);
6027 }
6028}
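/*
 * Layout note (illustrative, not from the original source): the 16-bit
 * guest interrupt status field packs RVI (requesting virtual interrupt)
 * into the low byte and SVI (servicing virtual interrupt) into the high
 * byte.  A value of 0x3051 therefore means SVI = 0x30 and RVI = 0x51;
 * vmx_set_rvi() above rewrites only the low byte, while
 * vmx_hwapic_isr_update() rewrites only the high byte.
 */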
6029
6030static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
6031{
6032 /*
6033	 * When running L2, updating RVI is only relevant when
6034	 * vmcs12 has virtual-interrupt-delivery enabled.
6035	 * However, that can only be enabled when L1 also
6036	 * intercepts external interrupts, and in that case
6037	 * we should not update vmcs02's RVI but instead intercept
6038	 * the interrupt. Therefore, do nothing when running L2.
6039 */
6040 if (!is_guest_mode(vcpu))
6041 vmx_set_rvi(max_irr);
6042}
6043
6044static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
6045{
6046 struct vcpu_vmx *vmx = to_vmx(vcpu);
6047 int max_irr;
6048 bool max_irr_updated;
6049
6050 WARN_ON(!vcpu->arch.apicv_active);
6051 if (pi_test_on(&vmx->pi_desc)) {
6052 pi_clear_on(&vmx->pi_desc);
6053 /*
6054 * IOMMU can write to PIR.ON, so the barrier matters even on UP.
6055 * But on x86 this is just a compiler barrier anyway.
6056 */
6057 smp_mb__after_atomic();
6058 max_irr_updated =
6059 kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
6060
6061 /*
6062 * If we are running L2 and L1 has a new pending interrupt
6063 * which can be injected, we should re-evaluate
6064 * what should be done with this new L1 interrupt.
6065		 * If L1 intercepts external interrupts, we should
6066		 * exit from L2 to L1. Otherwise, the interrupt should be
6067		 * delivered directly to L2.
6068 */
6069 if (is_guest_mode(vcpu) && max_irr_updated) {
6070 if (nested_exit_on_intr(vcpu))
6071 kvm_vcpu_exiting_guest_mode(vcpu);
6072 else
6073 kvm_make_request(KVM_REQ_EVENT, vcpu);
6074 }
6075 } else {
6076 max_irr = kvm_lapic_find_highest_irr(vcpu);
6077 }
6078 vmx_hwapic_irr_update(vcpu, max_irr);
6079 return max_irr;
6080}
6081
6082static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
6083{
6084 if (!kvm_vcpu_apicv_active(vcpu))
6085 return;
6086
6087 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
6088 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
6089 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
6090 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
6091}
6092
6093static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
6094{
6095 struct vcpu_vmx *vmx = to_vmx(vcpu);
6096
6097 pi_clear_on(&vmx->pi_desc);
6098 memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
6099}
6100
6101static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
6102{
6103 u32 exit_intr_info = 0;
6104 u16 basic_exit_reason = (u16)vmx->exit_reason;
6105
6106 if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
6107 || basic_exit_reason == EXIT_REASON_EXCEPTION_NMI))
6108 return;
6109
6110 if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
6111 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
6112 vmx->exit_intr_info = exit_intr_info;
6113
6114	/* If the exit is due to a page fault, check for an async page fault. */
6115 if (is_page_fault(exit_intr_info))
6116 vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
6117
6118 /* Handle machine checks before interrupts are enabled */
6119 if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY ||
6120 is_machine_check(exit_intr_info))
6121 kvm_machine_check();
6122
6123 /* We need to handle NMIs before interrupts are enabled */
6124 if (is_nmi(exit_intr_info)) {
6125 kvm_before_interrupt(&vmx->vcpu);
6126 asm("int $2");
6127 kvm_after_interrupt(&vmx->vcpu);
6128 }
6129}
6130
6131static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
6132{
6133 u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
6134
6135 if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
6136 == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
6137 unsigned int vector;
6138 unsigned long entry;
6139 gate_desc *desc;
6140 struct vcpu_vmx *vmx = to_vmx(vcpu);
6141#ifdef CONFIG_X86_64
6142 unsigned long tmp;
6143#endif
6144
6145 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
6146 desc = (gate_desc *)vmx->host_idt_base + vector;
6147 entry = gate_offset(desc);
6148 asm volatile(
6149#ifdef CONFIG_X86_64
6150 "mov %%" _ASM_SP ", %[sp]\n\t"
6151 "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
6152 "push $%c[ss]\n\t"
6153 "push %[sp]\n\t"
6154#endif
6155 "pushf\n\t"
6156 __ASM_SIZE(push) " $%c[cs]\n\t"
6157 CALL_NOSPEC
6158 :
6159#ifdef CONFIG_X86_64
6160 [sp]"=&r"(tmp),
6161#endif
6162 ASM_CALL_CONSTRAINT
6163 :
6164 THUNK_TARGET(entry),
6165 [ss]"i"(__KERNEL_DS),
6166 [cs]"i"(__KERNEL_CS)
6167 );
6168 }
6169}
6170STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
6171
6172static bool vmx_has_emulated_msr(int index)
6173{
6174 switch (index) {
6175 case MSR_IA32_SMBASE:
6176 /*
6177 * We cannot do SMM unless we can run the guest in big
6178 * real mode.
6179 */
6180 return enable_unrestricted_guest || emulate_invalid_guest_state;
6181 case MSR_AMD64_VIRT_SPEC_CTRL:
6182 /* This is AMD only. */
6183 return false;
6184 default:
6185 return true;
6186 }
6187}
6188
6189static bool vmx_pt_supported(void)
6190{
6191 return pt_mode == PT_MODE_HOST_GUEST;
6192}
6193
6194static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
6195{
6196 u32 exit_intr_info;
6197 bool unblock_nmi;
6198 u8 vector;
6199 bool idtv_info_valid;
6200
6201 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
6202
6203 if (enable_vnmi) {
6204 if (vmx->loaded_vmcs->nmi_known_unmasked)
6205 return;
6206 /*
6207 * Can't use vmx->exit_intr_info since we're not sure what
6208 * the exit reason is.
6209 */
6210 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
6211 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
6212 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
6213 /*
6214 * SDM 3: 27.7.1.2 (September 2008)
6215		 * Re-set the "block by NMI" bit before VM entry if the VM exit
6216		 * was caused by a guest IRET fault.
6217 * SDM 3: 23.2.2 (September 2008)
6218 * Bit 12 is undefined in any of the following cases:
6219 * If the VM exit sets the valid bit in the IDT-vectoring
6220 * information field.
6221 * If the VM exit is due to a double fault.
6222 */
6223 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
6224 vector != DF_VECTOR && !idtv_info_valid)
6225 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6226 GUEST_INTR_STATE_NMI);
6227 else
6228 vmx->loaded_vmcs->nmi_known_unmasked =
6229 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
6230 & GUEST_INTR_STATE_NMI);
6231 } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
6232 vmx->loaded_vmcs->vnmi_blocked_time +=
6233 ktime_to_ns(ktime_sub(ktime_get(),
6234 vmx->loaded_vmcs->entry_time));
6235}
6236
6237static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
6238 u32 idt_vectoring_info,
6239 int instr_len_field,
6240 int error_code_field)
6241{
6242 u8 vector;
6243 int type;
6244 bool idtv_info_valid;
6245
6246 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
6247
6248 vcpu->arch.nmi_injected = false;
6249 kvm_clear_exception_queue(vcpu);
6250 kvm_clear_interrupt_queue(vcpu);
6251
6252 if (!idtv_info_valid)
6253 return;
6254
6255 kvm_make_request(KVM_REQ_EVENT, vcpu);
6256
6257 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
6258 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
6259
6260 switch (type) {
6261 case INTR_TYPE_NMI_INTR:
6262 vcpu->arch.nmi_injected = true;
6263 /*
6264 * SDM 3: 27.7.1.2 (September 2008)
6265		 * Clear the "block by NMI" bit before VM entry if an NMI
6266		 * delivery faulted.
6267 */
6268 vmx_set_nmi_mask(vcpu, false);
6269 break;
6270 case INTR_TYPE_SOFT_EXCEPTION:
6271 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
6272 /* fall through */
6273 case INTR_TYPE_HARD_EXCEPTION:
6274 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
6275 u32 err = vmcs_read32(error_code_field);
6276 kvm_requeue_exception_e(vcpu, vector, err);
6277 } else
6278 kvm_requeue_exception(vcpu, vector);
6279 break;
6280 case INTR_TYPE_SOFT_INTR:
6281 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
6282 /* fall through */
6283 case INTR_TYPE_EXT_INTR:
6284 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
6285 break;
6286 default:
6287 break;
6288 }
6289}
6290
6291static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
6292{
6293 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
6294 VM_EXIT_INSTRUCTION_LEN,
6295 IDT_VECTORING_ERROR_CODE);
6296}
6297
6298static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
6299{
6300 __vmx_complete_interrupts(vcpu,
6301 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
6302 VM_ENTRY_INSTRUCTION_LEN,
6303 VM_ENTRY_EXCEPTION_ERROR_CODE);
6304
6305 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
6306}
6307
6308static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
6309{
6310 int i, nr_msrs;
6311 struct perf_guest_switch_msr *msrs;
6312
6313 msrs = perf_guest_get_msrs(&nr_msrs);
6314
6315 if (!msrs)
6316 return;
6317
6318 for (i = 0; i < nr_msrs; i++)
6319 if (msrs[i].host == msrs[i].guest)
6320 clear_atomic_switch_msr(vmx, msrs[i].msr);
6321 else
6322 add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
6323 msrs[i].host, false);
6324}
6325
6326static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val)
6327{
6328 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val);
6329 if (!vmx->loaded_vmcs->hv_timer_armed)
6330 vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
6331 PIN_BASED_VMX_PREEMPTION_TIMER);
6332 vmx->loaded_vmcs->hv_timer_armed = true;
6333}
6334
6335static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
6336{
6337 struct vcpu_vmx *vmx = to_vmx(vcpu);
6338 u64 tscl;
6339 u32 delta_tsc;
6340
6341 if (vmx->req_immediate_exit) {
6342 vmx_arm_hv_timer(vmx, 0);
6343 return;
6344 }
6345
6346 if (vmx->hv_deadline_tsc != -1) {
6347 tscl = rdtsc();
6348 if (vmx->hv_deadline_tsc > tscl)
6349 /* set_hv_timer ensures the delta fits in 32-bits */
6350 delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
6351 cpu_preemption_timer_multi);
6352 else
6353 delta_tsc = 0;
6354
6355 vmx_arm_hv_timer(vmx, delta_tsc);
6356 return;
6357 }
6358
6359 if (vmx->loaded_vmcs->hv_timer_armed)
6360 vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
6361 PIN_BASED_VMX_PREEMPTION_TIMER);
6362 vmx->loaded_vmcs->hv_timer_armed = false;
6363}
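/*
 * Worked example (illustrative, not from the original source): the VMX
 * preemption timer counts down at the TSC rate divided by
 * 2^cpu_preemption_timer_multi (the rate field from IA32_VMX_MISC), which
 * is why the TSC delta above is shifted right rather than divided.  With
 * a rate field of 5 and a deadline 3,200,000 TSC cycles away, the timer
 * is armed with 3,200,000 >> 5 = 100,000 ticks; a deadline already in the
 * past arms it with 0 and forces an immediate exit after VM entry.
 */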
6364
6365static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
6366{
6367 struct vcpu_vmx *vmx = to_vmx(vcpu);
6368 unsigned long cr3, cr4, evmcs_rsp;
6369
6370 /* Record the guest's net vcpu time for enforced NMI injections. */
6371 if (unlikely(!enable_vnmi &&
6372 vmx->loaded_vmcs->soft_vnmi_blocked))
6373 vmx->loaded_vmcs->entry_time = ktime_get();
6374
6375	/* Don't enter VMX if guest state is invalid; let the exit handler
6376	   start emulation until we arrive back at a valid state. */
6377 if (vmx->emulation_required)
6378 return;
6379
6380 if (vmx->ple_window_dirty) {
6381 vmx->ple_window_dirty = false;
6382 vmcs_write32(PLE_WINDOW, vmx->ple_window);
6383 }
6384
6385 if (vmx->nested.need_vmcs12_sync)
6386 nested_sync_from_vmcs12(vcpu);
6387
6388 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
6389 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
6390 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
6391 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
6392
6393 cr3 = __get_current_cr3_fast();
6394 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
6395 vmcs_writel(HOST_CR3, cr3);
6396 vmx->loaded_vmcs->host_state.cr3 = cr3;
6397 }
6398
6399 cr4 = cr4_read_shadow();
6400 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
6401 vmcs_writel(HOST_CR4, cr4);
6402 vmx->loaded_vmcs->host_state.cr4 = cr4;
6403 }
6404
6405 /* When single-stepping over STI and MOV SS, we must clear the
6406 * corresponding interruptibility bits in the guest state. Otherwise
6407	 * vmentry fails as it then expects bit 14 (BS) of the pending debug
6408	 * exceptions field to be set, but that's not correct for the guest
6409	 * debugging case. */
6410 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
6411 vmx_set_interrupt_shadow(vcpu, 0);
6412
6413 if (static_cpu_has(X86_FEATURE_PKU) &&
6414 kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
6415 vcpu->arch.pkru != vmx->host_pkru)
6416 __write_pkru(vcpu->arch.pkru);
6417
6418 pt_guest_enter(vmx);
6419
6420 atomic_switch_perf_msrs(vmx);
6421
6422 vmx_update_hv_timer(vcpu);
6423
6424 /*
6425 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
6426 * it's non-zero. Since vmentry is serialising on affected CPUs, there
6427 * is no need to worry about the conditional branch over the wrmsr
6428 * being speculatively taken.
6429 */
6430 x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
6431
6432 vmx->__launched = vmx->loaded_vmcs->launched;
6433
6434 evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
6435 (unsigned long)&current_evmcs->host_rsp : 0;
6436
6437 if (static_branch_unlikely(&vmx_l1d_should_flush))
6438 vmx_l1d_flush(vcpu);
6439
6440 asm(
6441 /* Store host registers */
6442 "push %%" _ASM_DX "; push %%" _ASM_BP ";"
6443 "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
6444 "push %%" _ASM_CX " \n\t"
6445 "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
6446 "cmp %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t"
6447 "je 1f \n\t"
6448 "mov %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t"
6449 /* Avoid VMWRITE when Enlightened VMCS is in use */
6450 "test %%" _ASM_SI ", %%" _ASM_SI " \n\t"
6451 "jz 2f \n\t"
6452 "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t"
6453 "jmp 1f \n\t"
6454 "2: \n\t"
6455 __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
6456 "1: \n\t"
6457 "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
6458
6459 /* Reload cr2 if changed */
6460 "mov %c[cr2](%%" _ASM_CX "), %%" _ASM_AX " \n\t"
6461 "mov %%cr2, %%" _ASM_DX " \n\t"
6462 "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
6463 "je 3f \n\t"
6464 "mov %%" _ASM_AX", %%cr2 \n\t"
6465 "3: \n\t"
6466 /* Check if vmlaunch or vmresume is needed */
6467 "cmpl $0, %c[launched](%%" _ASM_CX ") \n\t"
6468 /* Load guest registers. Don't clobber flags. */
6469 "mov %c[rax](%%" _ASM_CX "), %%" _ASM_AX " \n\t"
6470 "mov %c[rbx](%%" _ASM_CX "), %%" _ASM_BX " \n\t"
6471 "mov %c[rdx](%%" _ASM_CX "), %%" _ASM_DX " \n\t"
6472 "mov %c[rsi](%%" _ASM_CX "), %%" _ASM_SI " \n\t"
6473 "mov %c[rdi](%%" _ASM_CX "), %%" _ASM_DI " \n\t"
6474 "mov %c[rbp](%%" _ASM_CX "), %%" _ASM_BP " \n\t"
6475#ifdef CONFIG_X86_64
6476 "mov %c[r8](%%" _ASM_CX "), %%r8 \n\t"
6477 "mov %c[r9](%%" _ASM_CX "), %%r9 \n\t"
6478 "mov %c[r10](%%" _ASM_CX "), %%r10 \n\t"
6479 "mov %c[r11](%%" _ASM_CX "), %%r11 \n\t"
6480 "mov %c[r12](%%" _ASM_CX "), %%r12 \n\t"
6481 "mov %c[r13](%%" _ASM_CX "), %%r13 \n\t"
6482 "mov %c[r14](%%" _ASM_CX "), %%r14 \n\t"
6483 "mov %c[r15](%%" _ASM_CX "), %%r15 \n\t"
6484#endif
6485 /* Load guest RCX. This kills the vmx_vcpu pointer! */
6486 "mov %c[rcx](%%" _ASM_CX "), %%" _ASM_CX " \n\t"
6487
6488 /* Enter guest mode */
6489 "call vmx_vmenter\n\t"
6490
6491 /* Save guest's RCX to the stack placeholder (see above) */
6492 "mov %%" _ASM_CX ", %c[wordsize](%%" _ASM_SP ") \n\t"
6493
6494 /* Load host's RCX, i.e. the vmx_vcpu pointer */
6495 "pop %%" _ASM_CX " \n\t"
6496
6497 /* Set vmx->fail based on EFLAGS.{CF,ZF} */
6498 "setbe %c[fail](%%" _ASM_CX ")\n\t"
6499
6500 /* Save all guest registers, including RCX from the stack */
6501 "mov %%" _ASM_AX ", %c[rax](%%" _ASM_CX ") \n\t"
6502 "mov %%" _ASM_BX ", %c[rbx](%%" _ASM_CX ") \n\t"
6503 __ASM_SIZE(pop) " %c[rcx](%%" _ASM_CX ") \n\t"
6504 "mov %%" _ASM_DX ", %c[rdx](%%" _ASM_CX ") \n\t"
6505 "mov %%" _ASM_SI ", %c[rsi](%%" _ASM_CX ") \n\t"
6506 "mov %%" _ASM_DI ", %c[rdi](%%" _ASM_CX ") \n\t"
6507 "mov %%" _ASM_BP ", %c[rbp](%%" _ASM_CX ") \n\t"
6508#ifdef CONFIG_X86_64
6509 "mov %%r8, %c[r8](%%" _ASM_CX ") \n\t"
6510 "mov %%r9, %c[r9](%%" _ASM_CX ") \n\t"
6511 "mov %%r10, %c[r10](%%" _ASM_CX ") \n\t"
6512 "mov %%r11, %c[r11](%%" _ASM_CX ") \n\t"
6513 "mov %%r12, %c[r12](%%" _ASM_CX ") \n\t"
6514 "mov %%r13, %c[r13](%%" _ASM_CX ") \n\t"
6515 "mov %%r14, %c[r14](%%" _ASM_CX ") \n\t"
6516 "mov %%r15, %c[r15](%%" _ASM_CX ") \n\t"
6517 /*
6518 * Clear host registers marked as clobbered to prevent
6519 * speculative use.
6520 */
6521 "xor %%r8d, %%r8d \n\t"
6522 "xor %%r9d, %%r9d \n\t"
6523 "xor %%r10d, %%r10d \n\t"
6524 "xor %%r11d, %%r11d \n\t"
6525 "xor %%r12d, %%r12d \n\t"
6526 "xor %%r13d, %%r13d \n\t"
6527 "xor %%r14d, %%r14d \n\t"
6528 "xor %%r15d, %%r15d \n\t"
6529#endif
6530 "mov %%cr2, %%" _ASM_AX " \n\t"
6531 "mov %%" _ASM_AX ", %c[cr2](%%" _ASM_CX ") \n\t"
6532
6533 "xor %%eax, %%eax \n\t"
6534 "xor %%ebx, %%ebx \n\t"
6535 "xor %%esi, %%esi \n\t"
6536 "xor %%edi, %%edi \n\t"
6537 "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
6538 : ASM_CALL_CONSTRAINT
6539 : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp),
6540 [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
6541 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
6542 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
6543 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
6544 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
6545 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
6546 [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
6547 [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
6548 [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
6549 [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
6550#ifdef CONFIG_X86_64
6551 [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
6552 [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
6553 [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
6554 [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
6555 [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
6556 [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
6557 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
6558 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
6559#endif
6560 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
6561 [wordsize]"i"(sizeof(ulong))
6562 : "cc", "memory"
6563#ifdef CONFIG_X86_64
6564 , "rax", "rbx", "rdi"
6565 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
6566#else
6567 , "eax", "ebx", "edi"
6568#endif
6569 );
6570
6571 /*
6572 * We do not use IBRS in the kernel. If this vCPU has used the
6573 * SPEC_CTRL MSR it may have left it on; save the value and
6574 * turn it off. This is much more efficient than blindly adding
6575 * it to the atomic save/restore list. Especially as the former
6576 * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
6577 *
6578	 * For the non-nested case:
6579 * If the L01 MSR bitmap does not intercept the MSR, then we need to
6580 * save it.
6581 *
6582	 * For the nested case:
6583 * If the L02 MSR bitmap does not intercept the MSR, then we need to
6584 * save it.
6585 */
6586 if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
6587 vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
6588
6589 x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
6590
6591 /* Eliminate branch target predictions from guest mode */
6592 vmexit_fill_RSB();
6593
6594 /* All fields are clean at this point */
6595 if (static_branch_unlikely(&enable_evmcs))
6596 current_evmcs->hv_clean_fields |=
6597 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
6598
6599 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
6600 if (vmx->host_debugctlmsr)
6601 update_debugctlmsr(vmx->host_debugctlmsr);
6602
6603#ifndef CONFIG_X86_64
6604 /*
6605 * The sysexit path does not restore ds/es, so we must set them to
6606 * a reasonable value ourselves.
6607 *
6608 * We can't defer this to vmx_prepare_switch_to_host() since that
6609 * function may be executed in interrupt context, which saves and
6610	 * restores segments around it, nullifying its effect.
6611 */
6612 loadsegment(ds, __USER_DS);
6613 loadsegment(es, __USER_DS);
6614#endif
6615
6616 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
6617 | (1 << VCPU_EXREG_RFLAGS)
6618 | (1 << VCPU_EXREG_PDPTR)
6619 | (1 << VCPU_EXREG_SEGMENTS)
6620 | (1 << VCPU_EXREG_CR3));
6621 vcpu->arch.regs_dirty = 0;
6622
6623 pt_guest_exit(vmx);
6624
6625 /*
6626	 * Eager FPU is enabled if PKEY is supported and CR4 has been switched
6627	 * back to the host value, so it is safe to read the guest PKRU from the
6628	 * current XSAVE area.
6629 */
6630 if (static_cpu_has(X86_FEATURE_PKU) &&
6631 kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
6632 vcpu->arch.pkru = __read_pkru();
6633 if (vcpu->arch.pkru != vmx->host_pkru)
6634 __write_pkru(vmx->host_pkru);
6635 }
6636
6637 vmx->nested.nested_run_pending = 0;
6638 vmx->idt_vectoring_info = 0;
6639
6640 vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
6641 if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
6642 return;
6643
6644 vmx->loaded_vmcs->launched = 1;
6645 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
6646
6647 vmx_complete_atomic_exit(vmx);
6648 vmx_recover_nmi_blocking(vmx);
6649 vmx_complete_interrupts(vmx);
6650}
6651STACK_FRAME_NON_STANDARD(vmx_vcpu_run);
6652
6653static struct kvm *vmx_vm_alloc(void)
6654{
6655 struct kvm_vmx *kvm_vmx = vzalloc(sizeof(struct kvm_vmx));
6656 return &kvm_vmx->kvm;
6657}
6658
6659static void vmx_vm_free(struct kvm *kvm)
6660{
6661 vfree(to_kvm_vmx(kvm));
6662}
6663
6664static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
6665{
6666 struct vcpu_vmx *vmx = to_vmx(vcpu);
6667
6668 if (enable_pml)
6669 vmx_destroy_pml_buffer(vmx);
6670 free_vpid(vmx->vpid);
6671 leave_guest_mode(vcpu);
6672 nested_vmx_free_vcpu(vcpu);
6673 free_loaded_vmcs(vmx->loaded_vmcs);
6674 kfree(vmx->guest_msrs);
6675 kvm_vcpu_uninit(vcpu);
6676 kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu);
6677 kmem_cache_free(kvm_vcpu_cache, vmx);
6678}
6679
6680static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
6681{
6682 int err;
6683 struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
6684 unsigned long *msr_bitmap;
6685 int cpu;
6686
6687 if (!vmx)
6688 return ERR_PTR(-ENOMEM);
6689
6690 vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL);
6691 if (!vmx->vcpu.arch.guest_fpu) {
6692 printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
6693 err = -ENOMEM;
6694 goto free_partial_vcpu;
6695 }
6696
6697 vmx->vpid = allocate_vpid();
6698
6699 err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
6700 if (err)
6701 goto free_vcpu;
6702
6703 err = -ENOMEM;
6704
6705 /*
6706	 * If PML is turned on, a failure to enable PML just results in a failure
6707	 * to create the vcpu; therefore we can simplify the PML logic (by
6708	 * avoiding cases such as enabling PML only partially on the vcpus
6709	 * of the guest, etc.).
6710 */
6711 if (enable_pml) {
6712 vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
6713 if (!vmx->pml_pg)
6714 goto uninit_vcpu;
6715 }
6716
6717 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
6718 BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
6719 > PAGE_SIZE);
6720
6721 if (!vmx->guest_msrs)
6722 goto free_pml;
6723
6724 err = alloc_loaded_vmcs(&vmx->vmcs01);
6725 if (err < 0)
6726 goto free_msrs;
6727
6728 msr_bitmap = vmx->vmcs01.msr_bitmap;
6729 vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R);
6730 vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
6731 vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
6732 vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
6733 vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
6734 vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
6735 vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
6736 vmx->msr_bitmap_mode = 0;
6737
6738 vmx->loaded_vmcs = &vmx->vmcs01;
6739 cpu = get_cpu();
6740 vmx_vcpu_load(&vmx->vcpu, cpu);
6741 vmx->vcpu.cpu = cpu;
6742 vmx_vcpu_setup(vmx);
6743 vmx_vcpu_put(&vmx->vcpu);
6744 put_cpu();
6745 if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
6746 err = alloc_apic_access_page(kvm);
6747 if (err)
6748 goto free_vmcs;
6749 }
6750
6751 if (enable_ept && !enable_unrestricted_guest) {
6752 err = init_rmode_identity_map(kvm);
6753 if (err)
6754 goto free_vmcs;
6755 }
6756
6757 if (nested)
6758 nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
6759 vmx_capability.ept,
6760 kvm_vcpu_apicv_active(&vmx->vcpu));
6761 else
6762 memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
6763
6764 vmx->nested.posted_intr_nv = -1;
6765 vmx->nested.current_vmptr = -1ull;
6766
6767 vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
6768
6769 /*
6770 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
6771 * or POSTED_INTR_WAKEUP_VECTOR.
6772 */
6773 vmx->pi_desc.nv = POSTED_INTR_VECTOR;
6774 vmx->pi_desc.sn = 1;
6775
6776 vmx->ept_pointer = INVALID_PAGE;
6777
6778 return &vmx->vcpu;
6779
6780free_vmcs:
6781 free_loaded_vmcs(vmx->loaded_vmcs);
6782free_msrs:
6783 kfree(vmx->guest_msrs);
6784free_pml:
6785 vmx_destroy_pml_buffer(vmx);
6786uninit_vcpu:
6787 kvm_vcpu_uninit(&vmx->vcpu);
6788free_vcpu:
6789 free_vpid(vmx->vpid);
6790 kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu);
6791free_partial_vcpu:
6792 kmem_cache_free(kvm_vcpu_cache, vmx);
6793 return ERR_PTR(err);
6794}
6795
6796#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
6797#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
6798
6799static int vmx_vm_init(struct kvm *kvm)
6800{
6801 spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
6802
6803 if (!ple_gap)
6804 kvm->arch.pause_in_guest = true;
6805
6806 if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
6807 switch (l1tf_mitigation) {
6808 case L1TF_MITIGATION_OFF:
6809 case L1TF_MITIGATION_FLUSH_NOWARN:
6810 /* 'I explicitly don't care' is set */
6811 break;
6812 case L1TF_MITIGATION_FLUSH:
6813 case L1TF_MITIGATION_FLUSH_NOSMT:
6814 case L1TF_MITIGATION_FULL:
6815 /*
6816 * Warn upon starting the first VM in a potentially
6817 * insecure environment.
6818 */
6819 if (cpu_smt_control == CPU_SMT_ENABLED)
6820 pr_warn_once(L1TF_MSG_SMT);
6821 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
6822 pr_warn_once(L1TF_MSG_L1D);
6823 break;
6824 case L1TF_MITIGATION_FULL_FORCE:
6825 /* Flush is enforced */
6826 break;
6827 }
6828 }
6829 return 0;
6830}
6831
6832static void __init vmx_check_processor_compat(void *rtn)
6833{
6834 struct vmcs_config vmcs_conf;
6835 struct vmx_capability vmx_cap;
6836
6837 *(int *)rtn = 0;
6838 if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
6839 *(int *)rtn = -EIO;
6840 if (nested)
6841 nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept,
6842 enable_apicv);
6843 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
6844 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
6845 smp_processor_id());
6846 *(int *)rtn = -EIO;
6847 }
6848}
6849
6850static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
6851{
6852 u8 cache;
6853 u64 ipat = 0;
6854
6855	/* For the VT-d and EPT combination:
6856	 * 1. MMIO: always map as UC.
6857	 * 2. EPT with VT-d:
6858	 *   a. VT-d without the snooping control feature: we can't guarantee
6859	 *      the result, so try to trust the guest.
6860	 *   b. VT-d with the snooping control feature: the VT-d engine's snooping
6861	 *      control guarantees cache correctness. Just set the type to WB to
6862	 *      stay consistent with the host, i.e. the same as item 3.
6863	 * 3. EPT without VT-d: always map as WB and set IPAT=1 to stay
6864	 *    consistent with the host MTRRs.
6865 */
6866 if (is_mmio) {
6867 cache = MTRR_TYPE_UNCACHABLE;
6868 goto exit;
6869 }
6870
6871 if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
6872 ipat = VMX_EPT_IPAT_BIT;
6873 cache = MTRR_TYPE_WRBACK;
6874 goto exit;
6875 }
6876
6877 if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
6878 ipat = VMX_EPT_IPAT_BIT;
6879 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
6880 cache = MTRR_TYPE_WRBACK;
6881 else
6882 cache = MTRR_TYPE_UNCACHABLE;
6883 goto exit;
6884 }
6885
6886 cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
6887
6888exit:
6889 return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
6890}
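/*
 * Worked example (illustrative, not from the original source): the return
 * value feeds bits 5:3 (memory type) and bit 6 (ignore-PAT) of the EPT
 * leaf entry.  For a guest with no non-coherent DMA the function returns
 * (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT, i.e.
 * (6 << 3) | (1 << 6) = 0x70: write-back with the guest PAT ignored.
 * MMIO gets UC (0) with the IPAT bit left clear.
 */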
6891
6892static int vmx_get_lpage_level(void)
6893{
6894 if (enable_ept && !cpu_has_vmx_ept_1g_page())
6895 return PT_DIRECTORY_LEVEL;
6896 else
6897		/* Shadow paging, or EPT with 1GB page support */
6898 return PT_PDPE_LEVEL;
6899}
6900
6901static void vmcs_set_secondary_exec_control(u32 new_ctl)
6902{
6903 /*
6904 * These bits in the secondary execution controls field
6905	 * are dynamic; the others are mostly based on the hypervisor
6906 * architecture and the guest's CPUID. Do not touch the
6907 * dynamic bits.
6908 */
6909 u32 mask =
6910 SECONDARY_EXEC_SHADOW_VMCS |
6911 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
6912 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
6913 SECONDARY_EXEC_DESC;
6914
6915 u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
6916
6917 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
6918 (new_ctl & ~mask) | (cur_ctl & mask));
6919}
6920
6921/*
6922 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
6923 * (indicating "allowed-1") if they are supported in the guest's CPUID.
6924 */
6925static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
6926{
6927 struct vcpu_vmx *vmx = to_vmx(vcpu);
6928 struct kvm_cpuid_entry2 *entry;
6929
6930 vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
6931 vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
6932
6933#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \
6934 if (entry && (entry->_reg & (_cpuid_mask))) \
6935 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \
6936} while (0)
6937
6938 entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
6939 cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME));
6940 cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME));
6941 cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC));
6942 cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE));
6943 cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE));
6944 cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE));
6945 cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE));
6946 cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE));
6947 cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR));
6948 cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM));
6949 cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX));
6950 cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX));
6951 cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID));
6952 cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE));
6953
6954 entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
6955 cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE));
6956 cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP));
6957 cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP));
6958 cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU));
6959 cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP));
6960
6961#undef cr4_fixed1_update
6962}
6963
6964static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
6965{
6966 struct vcpu_vmx *vmx = to_vmx(vcpu);
6967
6968 if (kvm_mpx_supported()) {
6969 bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);
6970
6971 if (mpx_enabled) {
6972 vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
6973 vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
6974 } else {
6975 vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
6976 vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
6977 }
6978 }
6979}
6980
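/*
 * Cache the guest's Intel PT CPUID leaves and derive the RTIT_CTL
 * reserved-bit mask (ctl_bitmask) from the PT capabilities exposed in
 * the guest's CPUID.
 */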
6981static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
6982{
6983 struct vcpu_vmx *vmx = to_vmx(vcpu);
6984 struct kvm_cpuid_entry2 *best = NULL;
6985 int i;
6986
6987 for (i = 0; i < PT_CPUID_LEAVES; i++) {
6988 best = kvm_find_cpuid_entry(vcpu, 0x14, i);
6989 if (!best)
6990 return;
6991 vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
6992 vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
6993 vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
6994 vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
6995 }
6996
6997 /* Get the number of configurable Address Ranges for filtering */
6998 vmx->pt_desc.addr_range = intel_pt_validate_cap(vmx->pt_desc.caps,
6999 PT_CAP_num_address_ranges);
7000
7001	/* Initialize the bitmask; clear (i.e. allow) only the bits with no CPUID dependency */
7002 vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
7003 RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC);
7004
7005 /*
7006	 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1, CR3Filter can be set; otherwise
7007	 * setting it will inject a #GP
7008 */
7009 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
7010 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
7011
7012 /*
7013 * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
7014 * PSBFreq can be set
7015 */
7016 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
7017 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
7018 RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
7019
7020 /*
7021	 * If CPUID.(EAX=14H,ECX=0):EBX[3]=1, MTCEn, BranchEn and
7022 * MTCFreq can be set
7023 */
7024 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
7025 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
7026 RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE);
7027
7028 /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
7029 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
7030 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
7031 RTIT_CTL_PTW_EN);
7032
7033 /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
7034 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
7035 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
7036
7037 /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
7038 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
7039 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
7040
7041	/* If CPUID.(EAX=14H,ECX=0):ECX[3]=1, FabricEn can be set */
7042 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
7043 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
7044
7045	/* Unmask the address range configuration bits */
7046 for (i = 0; i < vmx->pt_desc.addr_range; i++)
7047 vmx->pt_desc.ctl_bitmask &= ~(0xf << (32 + i * 4));
7048}
7049
7050static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
7051{
7052 struct vcpu_vmx *vmx = to_vmx(vcpu);
7053
7054 if (cpu_has_secondary_exec_ctrls()) {
7055 vmx_compute_secondary_exec_control(vmx);
7056 vmcs_set_secondary_exec_control(vmx->secondary_exec_control);
7057 }
7058
7059 if (nested_vmx_allowed(vcpu))
7060 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
7061 FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
7062 else
7063 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
7064 ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
7065
7066 if (nested_vmx_allowed(vcpu)) {
7067 nested_vmx_cr_fixed1_bits_update(vcpu);
7068 nested_vmx_entry_exit_ctls_update(vcpu);
7069 }
7070
7071 if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
7072 guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
7073 update_intel_pt_cfg(vcpu);
7074}
7075
7076static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
7077{
7078 if (func == 1 && nested)
7079 entry->ecx |= bit(X86_FEATURE_VMX);
7080}
7081
7082static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
7083{
7084 to_vmx(vcpu)->req_immediate_exit = true;
7085}
7086
7087static int vmx_check_intercept(struct kvm_vcpu *vcpu,
7088 struct x86_instruction_info *info,
7089 enum x86_intercept_stage stage)
7090{
7091 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7092 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
7093
7094 /*
7095 * RDPID causes #UD if disabled through secondary execution controls.
7096 * Because it is marked as EmulateOnUD, we need to intercept it here.
7097 */
7098 if (info->intercept == x86_intercept_rdtscp &&
7099 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
7100 ctxt->exception.vector = UD_VECTOR;
7101 ctxt->exception.error_code_valid = false;
7102 return X86EMUL_PROPAGATE_FAULT;
7103 }
7104
7105 /* TODO: check more intercepts... */
7106 return X86EMUL_CONTINUE;
7107}
7108
7109#ifdef CONFIG_X86_64
7110/* (a << shift) / divisor, return 1 if overflow otherwise 0 */
7111static inline int u64_shl_div_u64(u64 a, unsigned int shift,
7112 u64 divisor, u64 *result)
7113{
7114 u64 low = a << shift, high = a >> (64 - shift);
7115
7116	/* Avoid overflow on divq: the quotient must fit in 64 bits */
7117 if (high >= divisor)
7118 return 1;
7119
7120	/* low holds the quotient, high holds the remainder (discarded) */
7121 asm("divq %2\n\t" : "=a" (low), "=d" (high) :
7122 "rm" (divisor), "0" (low), "1" (high));
7123 *result = low;
7124
7125 return 0;
7126}
7127
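/*
 * Arm the VMX preemption timer for the guest's APIC timer deadline.
 * Returns 1 if the deadline has already passed (expire immediately),
 * 0 if the timer was armed, or a negative error if the preemption
 * timer can't be used for this deadline.
 */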
7128static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
7129{
7130 struct vcpu_vmx *vmx;
7131 u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
7132
7133 if (kvm_mwait_in_guest(vcpu->kvm))
7134 return -EOPNOTSUPP;
7135
7136 vmx = to_vmx(vcpu);
7137 tscl = rdtsc();
7138 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
7139 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
7140 lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns);
7141
7142 if (delta_tsc > lapic_timer_advance_cycles)
7143 delta_tsc -= lapic_timer_advance_cycles;
7144 else
7145 delta_tsc = 0;
7146
7147 /* Convert to host delta tsc if tsc scaling is enabled */
7148 if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
7149 u64_shl_div_u64(delta_tsc,
7150 kvm_tsc_scaling_ratio_frac_bits,
7151 vcpu->arch.tsc_scaling_ratio,
7152 &delta_tsc))
7153 return -ERANGE;
7154
7155 /*
7156	 * If the delta TSC doesn't fit in 32 bits after the preemption
7157	 * timer rate shift, the preemption timer can't be used.
7158	 * It might fit on later VM entries, but checking on every entry
7159	 * is costly, so just fall back to an hrtimer.
7160 */
7161 if (delta_tsc >> (cpu_preemption_timer_multi + 32))
7162 return -ERANGE;
7163
7164 vmx->hv_deadline_tsc = tscl + delta_tsc;
7165 return delta_tsc == 0;
7166}
7167
7168static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
7169{
7170 to_vmx(vcpu)->hv_deadline_tsc = -1;
7171}
7172#endif
7173
7174static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
7175{
7176 if (!kvm_pause_in_guest(vcpu->kvm))
7177 shrink_ple_window(vcpu);
7178}
7179
7180static void vmx_slot_enable_log_dirty(struct kvm *kvm,
7181 struct kvm_memory_slot *slot)
7182{
7183 kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
7184 kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
7185}
7186
7187static void vmx_slot_disable_log_dirty(struct kvm *kvm,
7188 struct kvm_memory_slot *slot)
7189{
7190 kvm_mmu_slot_set_dirty(kvm, slot);
7191}
7192
7193static void vmx_flush_log_dirty(struct kvm *kvm)
7194{
7195 kvm_flush_pml_buffers(kvm);
7196}
7197
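/*
 * When L2 is running with PML enabled by L1, emulate PML by logging the
 * dirtied GPA into L1's PML buffer.  Returns 1 (and flags nested PML as
 * full) once guest_pml_index is exhausted, 0 otherwise.
 */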
7198static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
7199{
7200 struct vmcs12 *vmcs12;
7201 struct vcpu_vmx *vmx = to_vmx(vcpu);
7202 gpa_t gpa;
7203 struct page *page = NULL;
7204 u64 *pml_address;
7205
7206 if (is_guest_mode(vcpu)) {
7207 WARN_ON_ONCE(vmx->nested.pml_full);
7208
7209 /*
7210 * Check if PML is enabled for the nested guest.
7211 * Whether eptp bit 6 is set is already checked
7212 * as part of A/D emulation.
7213 */
7214 vmcs12 = get_vmcs12(vcpu);
7215 if (!nested_cpu_has_pml(vmcs12))
7216 return 0;
7217
7218 if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
7219 vmx->nested.pml_full = true;
7220 return 1;
7221 }
7222
7223 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
7224
7225 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address);
7226 if (is_error_page(page))
7227 return 0;
7228
7229 pml_address = kmap(page);
7230 pml_address[vmcs12->guest_pml_index--] = gpa;
7231 kunmap(page);
7232 kvm_release_page_clean(page);
7233 }
7234
7235 return 0;
7236}
7237
7238static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
7239 struct kvm_memory_slot *memslot,
7240 gfn_t offset, unsigned long mask)
7241{
7242 kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
7243}
7244
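/*
 * Restore the posted-interrupt descriptor to normal operation: point
 * 'NDST' at the CPU the vCPU runs on, switch 'NV' back to
 * POSTED_INTR_VECTOR and take the vCPU off the per-CPU wakeup list.
 */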
7245static void __pi_post_block(struct kvm_vcpu *vcpu)
7246{
7247 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
7248 struct pi_desc old, new;
7249 unsigned int dest;
7250
7251 do {
7252 old.control = new.control = pi_desc->control;
7253 WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
7254 "Wakeup handler not enabled while the VCPU is blocked\n");
7255
7256 dest = cpu_physical_id(vcpu->cpu);
7257
7258 if (x2apic_enabled())
7259 new.ndst = dest;
7260 else
7261 new.ndst = (dest << 8) & 0xFF00;
7262
7263 /* set 'NV' to 'notification vector' */
7264 new.nv = POSTED_INTR_VECTOR;
7265 } while (cmpxchg64(&pi_desc->control, old.control,
7266 new.control) != old.control);
7267
7268 if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
7269 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
7270 list_del(&vcpu->blocked_vcpu_list);
7271 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
7272 vcpu->pre_pcpu = -1;
7273 }
7274}
7275
7276/*
7277	 * This routine does the following things for a vCPU that is about
7278	 * to block, when VT-d posted interrupts are enabled:
7279	 * - Add the vCPU to the per-CPU wakeup list, so that when an
7280	 *   interrupt is posted we can find the right vCPU to wake up.
7281	 * - Update the posted-interrupt descriptor:
7282	 *	'NDST' <-- vcpu->pre_pcpu
7283	 *	'NV'   <-- POSTED_INTR_WAKEUP_VECTOR
7284	 * - If 'ON' is set during this process, at least one interrupt
7285	 *   has already been posted for this vCPU and it must not block;
7286	 *   in that case return 1, otherwise return 0.
7287	 *
7288 */
7289static int pi_pre_block(struct kvm_vcpu *vcpu)
7290{
7291 unsigned int dest;
7292 struct pi_desc old, new;
7293 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
7294
7295 if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
7296 !irq_remapping_cap(IRQ_POSTING_CAP) ||
7297 !kvm_vcpu_apicv_active(vcpu))
7298 return 0;
7299
7300 WARN_ON(irqs_disabled());
7301 local_irq_disable();
7302 if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
7303 vcpu->pre_pcpu = vcpu->cpu;
7304 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
7305 list_add_tail(&vcpu->blocked_vcpu_list,
7306 &per_cpu(blocked_vcpu_on_cpu,
7307 vcpu->pre_pcpu));
7308 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
7309 }
7310
7311 do {
7312 old.control = new.control = pi_desc->control;
7313
7314 WARN((pi_desc->sn == 1),
7315 "Warning: SN field of posted-interrupts "
7316 "is set before blocking\n");
7317
7318 /*
7319		 * Since the vCPU can be preempted during this process,
7320		 * vcpu->cpu may differ from pre_pcpu.  Use pre_pcpu as
7321		 * the destination of the wakeup notification event, so
7322		 * that the wakeup handler can find the right vCPU to
7323		 * wake up if an interrupt arrives while the vCPU is
7324		 * in the blocked state.
7325 */
7326 dest = cpu_physical_id(vcpu->pre_pcpu);
7327
7328 if (x2apic_enabled())
7329 new.ndst = dest;
7330 else
7331 new.ndst = (dest << 8) & 0xFF00;
7332
7333 /* set 'NV' to 'wakeup vector' */
7334 new.nv = POSTED_INTR_WAKEUP_VECTOR;
7335 } while (cmpxchg64(&pi_desc->control, old.control,
7336 new.control) != old.control);
7337
7338 /* We should not block the vCPU if an interrupt is posted for it. */
7339 if (pi_test_on(pi_desc) == 1)
7340 __pi_post_block(vcpu);
7341
7342 local_irq_enable();
7343 return (vcpu->pre_pcpu == -1);
7344}
7345
7346static int vmx_pre_block(struct kvm_vcpu *vcpu)
7347{
7348 if (pi_pre_block(vcpu))
7349 return 1;
7350
7351 if (kvm_lapic_hv_timer_in_use(vcpu))
7352 kvm_lapic_switch_to_sw_timer(vcpu);
7353
7354 return 0;
7355}
7356
7357static void pi_post_block(struct kvm_vcpu *vcpu)
7358{
7359 if (vcpu->pre_pcpu == -1)
7360 return;
7361
7362 WARN_ON(irqs_disabled());
7363 local_irq_disable();
7364 __pi_post_block(vcpu);
7365 local_irq_enable();
7366}
7367
7368static void vmx_post_block(struct kvm_vcpu *vcpu)
7369{
7370 if (kvm_x86_ops->set_hv_timer)
7371 kvm_lapic_switch_to_hv_timer(vcpu);
7372
7373 pi_post_block(vcpu);
7374}
7375
7376/*
7377 * vmx_update_pi_irte - set IRTE for Posted-Interrupts
7378 *
7379 * @kvm: kvm
7380 * @host_irq: host irq of the interrupt
7381 * @guest_irq: gsi of the interrupt
7382 * @set: set or unset PI
7383 * returns 0 on success, < 0 on failure
7384 */
7385static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
7386 uint32_t guest_irq, bool set)
7387{
7388 struct kvm_kernel_irq_routing_entry *e;
7389 struct kvm_irq_routing_table *irq_rt;
7390 struct kvm_lapic_irq irq;
7391 struct kvm_vcpu *vcpu;
7392 struct vcpu_data vcpu_info;
7393 int idx, ret = 0;
7394
7395 if (!kvm_arch_has_assigned_device(kvm) ||
7396 !irq_remapping_cap(IRQ_POSTING_CAP) ||
7397 !kvm_vcpu_apicv_active(kvm->vcpus[0]))
7398 return 0;
7399
7400 idx = srcu_read_lock(&kvm->irq_srcu);
7401 irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
7402 if (guest_irq >= irq_rt->nr_rt_entries ||
7403 hlist_empty(&irq_rt->map[guest_irq])) {
7404 pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
7405 guest_irq, irq_rt->nr_rt_entries);
7406 goto out;
7407 }
7408
7409 hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
7410 if (e->type != KVM_IRQ_ROUTING_MSI)
7411 continue;
7412 /*
7413		 * VT-d PI cannot post multicast/broadcast interrupts to
7414		 * a vCPU; interrupt remapping is still used for those
7415		 * kinds of interrupts.
7416		 *
7417		 * For lowest-priority interrupts, only those with a
7418		 * single CPU as the destination are supported, e.g. the
7419		 * user configures the interrupt via /proc/irq or uses
7420		 * irqbalance to make it single-CPU.
7421		 *
7422		 * Full lowest-priority interrupt support may come later.
7423 */
7424
7425 kvm_set_msi_irq(kvm, e, &irq);
7426 if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
7427 /*
7428 * Make sure the IRTE is in remapped mode if
7429 * we don't handle it in posted mode.
7430 */
7431 ret = irq_set_vcpu_affinity(host_irq, NULL);
7432 if (ret < 0) {
7433 printk(KERN_INFO
7434				"failed to fall back to remapped mode, irq: %u\n",
7435 host_irq);
7436 goto out;
7437 }
7438
7439 continue;
7440 }
7441
7442 vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
7443 vcpu_info.vector = irq.vector;
7444
7445 trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
7446 vcpu_info.vector, vcpu_info.pi_desc_addr, set);
7447
7448 if (set)
7449 ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
7450 else
7451 ret = irq_set_vcpu_affinity(host_irq, NULL);
7452
7453 if (ret < 0) {
7454 printk(KERN_INFO "%s: failed to update PI IRTE\n",
7455 __func__);
7456 goto out;
7457 }
7458 }
7459
7460 ret = 0;
7461out:
7462 srcu_read_unlock(&kvm->irq_srcu, idx);
7463 return ret;
7464}
7465
7466static void vmx_setup_mce(struct kvm_vcpu *vcpu)
7467{
7468 if (vcpu->arch.mcg_cap & MCG_LMCE_P)
7469 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
7470 FEATURE_CONTROL_LMCE;
7471 else
7472 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
7473 ~FEATURE_CONTROL_LMCE;
7474}
7475
7476static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
7477{
7478	/* We need a nested VM exit to enter SMM; postpone if a nested run is pending. */
7479 if (to_vmx(vcpu)->nested.nested_run_pending)
7480 return 0;
7481 return 1;
7482}
7483
7484static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
7485{
7486 struct vcpu_vmx *vmx = to_vmx(vcpu);
7487
7488 vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
7489 if (vmx->nested.smm.guest_mode)
7490 nested_vmx_vmexit(vcpu, -1, 0, 0);
7491
7492 vmx->nested.smm.vmxon = vmx->nested.vmxon;
7493 vmx->nested.vmxon = false;
7494 vmx_clear_hlt(vcpu);
7495 return 0;
7496}
7497
7498static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
7499{
7500 struct vcpu_vmx *vmx = to_vmx(vcpu);
7501 int ret;
7502
7503 if (vmx->nested.smm.vmxon) {
7504 vmx->nested.vmxon = true;
7505 vmx->nested.smm.vmxon = false;
7506 }
7507
7508 if (vmx->nested.smm.guest_mode) {
7509 vcpu->arch.hflags &= ~HF_SMM_MASK;
7510 ret = nested_vmx_enter_non_root_mode(vcpu, false);
7511 vcpu->arch.hflags |= HF_SMM_MASK;
7512 if (ret)
7513 return ret;
7514
7515 vmx->nested.smm.guest_mode = false;
7516 }
7517 return 0;
7518}
7519
7520static int enable_smi_window(struct kvm_vcpu *vcpu)
7521{
7522 return 0;
7523}
7524
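/*
 * One-time module setup: probe the VMCS configuration, clamp module
 * parameters to what the CPU supports, drop kvm_x86_ops callbacks for
 * unavailable features and allocate the per-CPU VMX areas.
 */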
7525static __init int hardware_setup(void)
7526{
7527 unsigned long host_bndcfgs;
7528 int r, i;
7529
7530 rdmsrl_safe(MSR_EFER, &host_efer);
7531
7532 for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
7533 kvm_define_shared_msr(i, vmx_msr_index[i]);
7534
7535 if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
7536 return -EIO;
7537
7538 if (boot_cpu_has(X86_FEATURE_NX))
7539 kvm_enable_efer_bits(EFER_NX);
7540
7541 if (boot_cpu_has(X86_FEATURE_MPX)) {
7542 rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
7543 WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
7544 }
7545
7546 if (boot_cpu_has(X86_FEATURE_XSAVES))
7547 rdmsrl(MSR_IA32_XSS, host_xss);
7548
7549 if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
7550 !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
7551 enable_vpid = 0;
7552
7553 if (!cpu_has_vmx_ept() ||
7554 !cpu_has_vmx_ept_4levels() ||
7555 !cpu_has_vmx_ept_mt_wb() ||
7556 !cpu_has_vmx_invept_global())
7557 enable_ept = 0;
7558
7559 if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
7560 enable_ept_ad_bits = 0;
7561
7562 if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
7563 enable_unrestricted_guest = 0;
7564
7565 if (!cpu_has_vmx_flexpriority())
7566 flexpriority_enabled = 0;
7567
7568 if (!cpu_has_virtual_nmis())
7569 enable_vnmi = 0;
7570
7571 /*
7572 * set_apic_access_page_addr() is used to reload apic access
7573 * page upon invalidation. No need to do anything if not
7574 * using the APIC_ACCESS_ADDR VMCS field.
7575 */
7576 if (!flexpriority_enabled)
7577 kvm_x86_ops->set_apic_access_page_addr = NULL;
7578
7579 if (!cpu_has_vmx_tpr_shadow())
7580 kvm_x86_ops->update_cr8_intercept = NULL;
7581
7582 if (enable_ept && !cpu_has_vmx_ept_2m_page())
7583 kvm_disable_largepages();
7584
7585#if IS_ENABLED(CONFIG_HYPERV)
7586 if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
7587 && enable_ept) {
7588 kvm_x86_ops->tlb_remote_flush = hv_remote_flush_tlb;
7589 kvm_x86_ops->tlb_remote_flush_with_range =
7590 hv_remote_flush_tlb_with_range;
7591 }
7592#endif
7593
7594 if (!cpu_has_vmx_ple()) {
7595 ple_gap = 0;
7596 ple_window = 0;
7597 ple_window_grow = 0;
7598 ple_window_max = 0;
7599 ple_window_shrink = 0;
7600 }
7601
7602 if (!cpu_has_vmx_apicv()) {
7603 enable_apicv = 0;
7604 kvm_x86_ops->sync_pir_to_irr = NULL;
7605 }
7606
7607 if (cpu_has_vmx_tsc_scaling()) {
7608 kvm_has_tsc_control = true;
7609 kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
7610 kvm_tsc_scaling_ratio_frac_bits = 48;
7611 }
7612
7613 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
7614
7615 if (enable_ept)
7616 vmx_enable_tdp();
7617 else
7618 kvm_disable_tdp();
7619
7620 /*
7621 * Only enable PML when hardware supports PML feature, and both EPT
7622 * and EPT A/D bit features are enabled -- PML depends on them to work.
7623 */
7624 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
7625 enable_pml = 0;
7626
7627 if (!enable_pml) {
7628 kvm_x86_ops->slot_enable_log_dirty = NULL;
7629 kvm_x86_ops->slot_disable_log_dirty = NULL;
7630 kvm_x86_ops->flush_log_dirty = NULL;
7631 kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
7632 }
7633
7634 if (!cpu_has_vmx_preemption_timer())
7635 kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
7636
7637 if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
7638 u64 vmx_msr;
7639
7640 rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
7641 cpu_preemption_timer_multi =
7642 vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
7643 } else {
7644 kvm_x86_ops->set_hv_timer = NULL;
7645 kvm_x86_ops->cancel_hv_timer = NULL;
7646 }
7647
7648 kvm_set_posted_intr_wakeup_handler(wakeup_handler);
7649
7650 kvm_mce_cap_supported |= MCG_LMCE_P;
7651
7652 if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
7653 return -EINVAL;
7654 if (!enable_ept || !cpu_has_vmx_intel_pt())
7655 pt_mode = PT_MODE_SYSTEM;
7656
7657 if (nested) {
7658 nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
7659 vmx_capability.ept, enable_apicv);
7660
7661 r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
7662 if (r)
7663 return r;
7664 }
7665
7666 r = alloc_kvm_area();
7667 if (r)
7668 nested_vmx_hardware_unsetup();
7669 return r;
7670}
7671
7672static __exit void hardware_unsetup(void)
7673{
7674 if (nested)
7675 nested_vmx_hardware_unsetup();
7676
7677 free_kvm_area();
7678}
7679
7680static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
7681 .cpu_has_kvm_support = cpu_has_kvm_support,
7682 .disabled_by_bios = vmx_disabled_by_bios,
7683 .hardware_setup = hardware_setup,
7684 .hardware_unsetup = hardware_unsetup,
7685 .check_processor_compatibility = vmx_check_processor_compat,
7686 .hardware_enable = hardware_enable,
7687 .hardware_disable = hardware_disable,
7688 .cpu_has_accelerated_tpr = report_flexpriority,
7689 .has_emulated_msr = vmx_has_emulated_msr,
7690
7691 .vm_init = vmx_vm_init,
7692 .vm_alloc = vmx_vm_alloc,
7693 .vm_free = vmx_vm_free,
7694
7695 .vcpu_create = vmx_create_vcpu,
7696 .vcpu_free = vmx_free_vcpu,
7697 .vcpu_reset = vmx_vcpu_reset,
7698
7699 .prepare_guest_switch = vmx_prepare_switch_to_guest,
7700 .vcpu_load = vmx_vcpu_load,
7701 .vcpu_put = vmx_vcpu_put,
7702
7703 .update_bp_intercept = update_exception_bitmap,
7704 .get_msr_feature = vmx_get_msr_feature,
7705 .get_msr = vmx_get_msr,
7706 .set_msr = vmx_set_msr,
7707 .get_segment_base = vmx_get_segment_base,
7708 .get_segment = vmx_get_segment,
7709 .set_segment = vmx_set_segment,
7710 .get_cpl = vmx_get_cpl,
7711 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
7712 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
7713 .decache_cr3 = vmx_decache_cr3,
7714 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
7715 .set_cr0 = vmx_set_cr0,
7716 .set_cr3 = vmx_set_cr3,
7717 .set_cr4 = vmx_set_cr4,
7718 .set_efer = vmx_set_efer,
7719 .get_idt = vmx_get_idt,
7720 .set_idt = vmx_set_idt,
7721 .get_gdt = vmx_get_gdt,
7722 .set_gdt = vmx_set_gdt,
7723 .get_dr6 = vmx_get_dr6,
7724 .set_dr6 = vmx_set_dr6,
7725 .set_dr7 = vmx_set_dr7,
7726 .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
7727 .cache_reg = vmx_cache_reg,
7728 .get_rflags = vmx_get_rflags,
7729 .set_rflags = vmx_set_rflags,
7730
7731 .tlb_flush = vmx_flush_tlb,
7732 .tlb_flush_gva = vmx_flush_tlb_gva,
7733
7734 .run = vmx_vcpu_run,
7735 .handle_exit = vmx_handle_exit,
7736 .skip_emulated_instruction = skip_emulated_instruction,
7737 .set_interrupt_shadow = vmx_set_interrupt_shadow,
7738 .get_interrupt_shadow = vmx_get_interrupt_shadow,
7739 .patch_hypercall = vmx_patch_hypercall,
7740 .set_irq = vmx_inject_irq,
7741 .set_nmi = vmx_inject_nmi,
7742 .queue_exception = vmx_queue_exception,
7743 .cancel_injection = vmx_cancel_injection,
7744 .interrupt_allowed = vmx_interrupt_allowed,
7745 .nmi_allowed = vmx_nmi_allowed,
7746 .get_nmi_mask = vmx_get_nmi_mask,
7747 .set_nmi_mask = vmx_set_nmi_mask,
7748 .enable_nmi_window = enable_nmi_window,
7749 .enable_irq_window = enable_irq_window,
7750 .update_cr8_intercept = update_cr8_intercept,
7751 .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
7752 .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
7753 .get_enable_apicv = vmx_get_enable_apicv,
7754 .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
7755 .load_eoi_exitmap = vmx_load_eoi_exitmap,
7756 .apicv_post_state_restore = vmx_apicv_post_state_restore,
7757 .hwapic_irr_update = vmx_hwapic_irr_update,
7758 .hwapic_isr_update = vmx_hwapic_isr_update,
7759 .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
7760 .sync_pir_to_irr = vmx_sync_pir_to_irr,
7761 .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
7762
7763 .set_tss_addr = vmx_set_tss_addr,
7764 .set_identity_map_addr = vmx_set_identity_map_addr,
7765 .get_tdp_level = get_ept_level,
7766 .get_mt_mask = vmx_get_mt_mask,
7767
7768 .get_exit_info = vmx_get_exit_info,
7769
7770 .get_lpage_level = vmx_get_lpage_level,
7771
7772 .cpuid_update = vmx_cpuid_update,
7773
7774 .rdtscp_supported = vmx_rdtscp_supported,
7775 .invpcid_supported = vmx_invpcid_supported,
7776
7777 .set_supported_cpuid = vmx_set_supported_cpuid,
7778
7779 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
7780
7781 .read_l1_tsc_offset = vmx_read_l1_tsc_offset,
7782 .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
7783
7784 .set_tdp_cr3 = vmx_set_cr3,
7785
7786 .check_intercept = vmx_check_intercept,
7787 .handle_external_intr = vmx_handle_external_intr,
7788 .mpx_supported = vmx_mpx_supported,
7789 .xsaves_supported = vmx_xsaves_supported,
7790 .umip_emulated = vmx_umip_emulated,
7791 .pt_supported = vmx_pt_supported,
7792
7793 .request_immediate_exit = vmx_request_immediate_exit,
7794
7795 .sched_in = vmx_sched_in,
7796
7797 .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
7798 .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
7799 .flush_log_dirty = vmx_flush_log_dirty,
7800 .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
7801 .write_log_dirty = vmx_write_pml_buffer,
7802
7803 .pre_block = vmx_pre_block,
7804 .post_block = vmx_post_block,
7805
7806 .pmu_ops = &intel_pmu_ops,
7807
7808 .update_pi_irte = vmx_update_pi_irte,
7809
7810#ifdef CONFIG_X86_64
7811 .set_hv_timer = vmx_set_hv_timer,
7812 .cancel_hv_timer = vmx_cancel_hv_timer,
7813#endif
7814
7815 .setup_mce = vmx_setup_mce,
7816
7817 .smi_allowed = vmx_smi_allowed,
7818 .pre_enter_smm = vmx_pre_enter_smm,
7819 .pre_leave_smm = vmx_pre_leave_smm,
7820 .enable_smi_window = enable_smi_window,
7821
7822 .check_nested_events = NULL,
7823 .get_nested_state = NULL,
7824 .set_nested_state = NULL,
7825 .get_vmcs12_pages = NULL,
7826 .nested_enable_evmcs = NULL,
7827};
7828
7829static void vmx_cleanup_l1d_flush(void)
7830{
7831 if (vmx_l1d_flush_pages) {
7832 free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
7833 vmx_l1d_flush_pages = NULL;
7834 }
7835 /* Restore state so sysfs ignores VMX */
7836 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
7837}
7838
7839static void vmx_exit(void)
7840{
7841#ifdef CONFIG_KEXEC_CORE
7842 RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
7843 synchronize_rcu();
7844#endif
7845
7846 kvm_exit();
7847
7848#if IS_ENABLED(CONFIG_HYPERV)
7849 if (static_branch_unlikely(&enable_evmcs)) {
7850 int cpu;
7851 struct hv_vp_assist_page *vp_ap;
7852 /*
7853 * Reset everything to support using non-enlightened VMCS
7854 * access later (e.g. when we reload the module with
7855 * enlightened_vmcs=0)
7856 */
7857 for_each_online_cpu(cpu) {
7858 vp_ap = hv_get_vp_assist_page(cpu);
7859
7860 if (!vp_ap)
7861 continue;
7862
7863 vp_ap->current_nested_vmcs = 0;
7864 vp_ap->enlighten_vmentry = 0;
7865 }
7866
7867 static_branch_disable(&enable_evmcs);
7868 }
7869#endif
7870 vmx_cleanup_l1d_flush();
7871}
7872module_exit(vmx_exit);
7873
7874static int __init vmx_init(void)
7875{
7876 int r;
7877
7878#if IS_ENABLED(CONFIG_HYPERV)
7879 /*
7880	 * Enlightened VMCS usage must be recommended by the hypervisor and
7881	 * the host must support eVMCS v1 or above.  eVMCS support can also
7882	 * be disabled with the module parameter.
7883 */
7884 if (enlightened_vmcs &&
7885 ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
7886 (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
7887 KVM_EVMCS_VERSION) {
7888 int cpu;
7889
7890 /* Check that we have assist pages on all online CPUs */
7891 for_each_online_cpu(cpu) {
7892 if (!hv_get_vp_assist_page(cpu)) {
7893 enlightened_vmcs = false;
7894 break;
7895 }
7896 }
7897
7898 if (enlightened_vmcs) {
7899 pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
7900 static_branch_enable(&enable_evmcs);
7901 }
7902 } else {
7903 enlightened_vmcs = false;
7904 }
7905#endif
7906
7907 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
7908 __alignof__(struct vcpu_vmx), THIS_MODULE);
7909 if (r)
7910 return r;
7911
7912 /*
7913 * Must be called after kvm_init() so enable_ept is properly set
7914	 * up.  Hand in the mitigation parameter value that was stored by
7915	 * the pre-module-init parser.  If no parameter was given, it will
7916	 * contain 'auto', which will be turned into the default 'cond'
7917 * mitigation mode.
7918 */
7919 if (boot_cpu_has(X86_BUG_L1TF)) {
7920 r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
7921 if (r) {
7922 vmx_exit();
7923 return r;
7924 }
7925 }
7926
7927#ifdef CONFIG_KEXEC_CORE
7928 rcu_assign_pointer(crash_vmclear_loaded_vmcss,
7929 crash_vmclear_local_loaded_vmcss);
7930#endif
7931 vmx_check_vmcs12_offsets();
7932
7933 return 0;
7934}
7935module_init(vmx_init);
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
new file mode 100644
index 000000000000..99328954c2fc
--- /dev/null
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -0,0 +1,519 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef __KVM_X86_VMX_H
3#define __KVM_X86_VMX_H
4
5#include <linux/kvm_host.h>
6
7#include <asm/kvm.h>
8#include <asm/intel_pt.h>
9
10#include "capabilities.h"
11#include "ops.h"
12#include "vmcs.h"
13
14extern const u32 vmx_msr_index[];
15extern u64 host_efer;
16
17#define MSR_TYPE_R 1
18#define MSR_TYPE_W 2
19#define MSR_TYPE_RW 3
20
21#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
22
23#define NR_AUTOLOAD_MSRS 8
24
25struct vmx_msrs {
26 unsigned int nr;
27 struct vmx_msr_entry val[NR_AUTOLOAD_MSRS];
28};
29
30struct shared_msr_entry {
31 unsigned index;
32 u64 data;
33 u64 mask;
34};
35
36enum segment_cache_field {
37 SEG_FIELD_SEL = 0,
38 SEG_FIELD_BASE = 1,
39 SEG_FIELD_LIMIT = 2,
40 SEG_FIELD_AR = 3,
41
42 SEG_FIELD_NR = 4
43};
44
45/* Posted-Interrupt Descriptor */
46struct pi_desc {
47 u32 pir[8]; /* Posted interrupt requested */
48 union {
49 struct {
50 /* bit 256 - Outstanding Notification */
51 u16 on : 1,
52 /* bit 257 - Suppress Notification */
53 sn : 1,
54 /* bit 271:258 - Reserved */
55 rsvd_1 : 14;
56 /* bit 279:272 - Notification Vector */
57 u8 nv;
58 /* bit 287:280 - Reserved */
59 u8 rsvd_2;
60 /* bit 319:288 - Notification Destination */
61 u32 ndst;
62 };
63 u64 control;
64 };
65 u32 rsvd[6];
66} __aligned(64);
67
68#define RTIT_ADDR_RANGE 4
69
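/* Intel PT trace-control MSR state, kept separately for host and guest. */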
70struct pt_ctx {
71 u64 ctl;
72 u64 status;
73 u64 output_base;
74 u64 output_mask;
75 u64 cr3_match;
76 u64 addr_a[RTIT_ADDR_RANGE];
77 u64 addr_b[RTIT_ADDR_RANGE];
78};
79
80struct pt_desc {
81 u64 ctl_bitmask;
82 u32 addr_range;
83 u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
84 struct pt_ctx host;
85 struct pt_ctx guest;
86};
87
88/*
89 * The nested_vmx structure is part of vcpu_vmx, and holds information we need
90 * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
91 */
92struct nested_vmx {
93 /* Has the level1 guest done vmxon? */
94 bool vmxon;
95 gpa_t vmxon_ptr;
96 bool pml_full;
97
98 /* The guest-physical address of the current VMCS L1 keeps for L2 */
99 gpa_t current_vmptr;
100 /*
101 * Cache of the guest's VMCS, existing outside of guest memory.
102 * Loaded from guest memory during VMPTRLD. Flushed to guest
103 * memory during VMCLEAR and VMPTRLD.
104 */
105 struct vmcs12 *cached_vmcs12;
106 /*
107 * Cache of the guest's shadow VMCS, existing outside of guest
108 * memory. Loaded from guest memory during VM entry. Flushed
109 * to guest memory during VM exit.
110 */
111 struct vmcs12 *cached_shadow_vmcs12;
112 /*
113 * Indicates if the shadow vmcs or enlightened vmcs must be updated
114 * with the data held by struct vmcs12.
115 */
116 bool need_vmcs12_sync;
117 bool dirty_vmcs12;
118
119 /*
120 * vmcs02 has been initialized, i.e. state that is constant for
121 * vmcs02 has been written to the backing VMCS. Initialization
122 * is delayed until L1 actually attempts to run a nested VM.
123 */
124 bool vmcs02_initialized;
125
126 bool change_vmcs01_virtual_apic_mode;
127
128 /*
129 * Enlightened VMCS has been enabled. It does not mean that L1 has to
130 * use it. However, VMX features available to L1 will be limited based
131 * on what the enlightened VMCS supports.
132 */
133 bool enlightened_vmcs_enabled;
134
135 /* L2 must run next, and mustn't decide to exit to L1. */
136 bool nested_run_pending;
137
138 struct loaded_vmcs vmcs02;
139
140 /*
141 * Guest pages referred to in the vmcs02 with host-physical
142 * pointers, so we must keep them pinned while L2 runs.
143 */
144 struct page *apic_access_page;
145 struct page *virtual_apic_page;
146 struct page *pi_desc_page;
147 struct pi_desc *pi_desc;
148 bool pi_pending;
149 u16 posted_intr_nv;
150
151 struct hrtimer preemption_timer;
152 bool preemption_timer_expired;
153
154 /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
155 u64 vmcs01_debugctl;
156 u64 vmcs01_guest_bndcfgs;
157
158 u16 vpid02;
159 u16 last_vpid;
160
161 struct nested_vmx_msrs msrs;
162
163 /* SMM related state */
164 struct {
165 /* in VMX operation on SMM entry? */
166 bool vmxon;
167 /* in guest mode on SMM entry? */
168 bool guest_mode;
169 } smm;
170
171 gpa_t hv_evmcs_vmptr;
172 struct page *hv_evmcs_page;
173 struct hv_enlightened_vmcs *hv_evmcs;
174};
175
176struct vcpu_vmx {
177 struct kvm_vcpu vcpu;
178 unsigned long host_rsp;
179 u8 fail;
180 u8 msr_bitmap_mode;
181 u32 exit_intr_info;
182 u32 idt_vectoring_info;
183 ulong rflags;
184 struct shared_msr_entry *guest_msrs;
185 int nmsrs;
186 int save_nmsrs;
187 bool guest_msrs_dirty;
188 unsigned long host_idt_base;
189#ifdef CONFIG_X86_64
190 u64 msr_host_kernel_gs_base;
191 u64 msr_guest_kernel_gs_base;
192#endif
193
194 u64 arch_capabilities;
195 u64 spec_ctrl;
196
197 u32 vm_entry_controls_shadow;
198 u32 vm_exit_controls_shadow;
199 u32 secondary_exec_control;
200
201 /*
202 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
203 * non-nested (L1) guest, it always points to vmcs01. For a nested
204 * guest (L2), it points to a different VMCS. loaded_cpu_state points
205 * to the VMCS whose state is loaded into the CPU registers that only
206 * need to be switched when transitioning to/from the kernel; a NULL
207 * value indicates that host state is loaded.
208 */
209 struct loaded_vmcs vmcs01;
210 struct loaded_vmcs *loaded_vmcs;
211 struct loaded_vmcs *loaded_cpu_state;
212 bool __launched; /* temporary, used in vmx_vcpu_run */
213 struct msr_autoload {
214 struct vmx_msrs guest;
215 struct vmx_msrs host;
216 } msr_autoload;
217
218 struct {
219 int vm86_active;
220 ulong save_rflags;
221 struct kvm_segment segs[8];
222 } rmode;
223 struct {
224 u32 bitmask; /* 4 bits per segment (1 bit per field) */
225 struct kvm_save_segment {
226 u16 selector;
227 unsigned long base;
228 u32 limit;
229 u32 ar;
230 } seg[8];
231 } segment_cache;
232 int vpid;
233 bool emulation_required;
234
235 u32 exit_reason;
236
237 /* Posted interrupt descriptor */
238 struct pi_desc pi_desc;
239
240 /* Support for a guest hypervisor (nested VMX) */
241 struct nested_vmx nested;
242
243 /* Dynamic PLE window. */
244 int ple_window;
245 bool ple_window_dirty;
246
247 bool req_immediate_exit;
248
249 /* Support for PML */
250#define PML_ENTITY_NUM 512
251 struct page *pml_pg;
252
253 /* apic deadline value in host tsc */
254 u64 hv_deadline_tsc;
255
256 u64 current_tsc_ratio;
257
258 u32 host_pkru;
259
260 unsigned long host_debugctlmsr;
261
262 /*
263 * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
264 * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
265 * in msr_ia32_feature_control_valid_bits.
266 */
267 u64 msr_ia32_feature_control;
268 u64 msr_ia32_feature_control_valid_bits;
269 u64 ept_pointer;
270
271 struct pt_desc pt_desc;
272};
273
274enum ept_pointers_status {
275 EPT_POINTERS_CHECK = 0,
276 EPT_POINTERS_MATCH = 1,
277 EPT_POINTERS_MISMATCH = 2
278};
279
280struct kvm_vmx {
281 struct kvm kvm;
282
283 unsigned int tss_addr;
284 bool ept_identity_pagetable_done;
285 gpa_t ept_identity_map_addr;
286
287 enum ept_pointers_status ept_pointers_match;
288 spinlock_t ept_pointer_lock;
289};
290
291bool nested_vmx_allowed(struct kvm_vcpu *vcpu);
292void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
293void vmx_vcpu_put(struct kvm_vcpu *vcpu);
294int allocate_vpid(void);
295void free_vpid(int vpid);
296void vmx_set_constant_host_state(struct vcpu_vmx *vmx);
297void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
298int vmx_get_cpl(struct kvm_vcpu *vcpu);
299unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu);
300void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
301u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu);
302void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask);
303void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer);
304void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
305void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
306int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
307void set_cr4_guest_host_mask(struct vcpu_vmx *vmx);
308void ept_save_pdptrs(struct kvm_vcpu *vcpu);
309void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
310void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
311u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
312void update_exception_bitmap(struct kvm_vcpu *vcpu);
313void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
314bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
315void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
316void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
317struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr);
318void pt_update_intercept_for_msr(struct vcpu_vmx *vmx);
319
320#define POSTED_INTR_ON 0
321#define POSTED_INTR_SN 1
322
323static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
324{
325 return test_and_set_bit(POSTED_INTR_ON,
326 (unsigned long *)&pi_desc->control);
327}
328
329static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
330{
331 return test_and_clear_bit(POSTED_INTR_ON,
332 (unsigned long *)&pi_desc->control);
333}
334
335static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
336{
337 return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
338}
339
340static inline void pi_clear_sn(struct pi_desc *pi_desc)
341{
342 return clear_bit(POSTED_INTR_SN,
343 (unsigned long *)&pi_desc->control);
344}
345
346static inline void pi_set_sn(struct pi_desc *pi_desc)
347{
348 return set_bit(POSTED_INTR_SN,
349 (unsigned long *)&pi_desc->control);
350}
351
352static inline void pi_clear_on(struct pi_desc *pi_desc)
353{
354 clear_bit(POSTED_INTR_ON,
355 (unsigned long *)&pi_desc->control);
356}
357
358static inline int pi_test_on(struct pi_desc *pi_desc)
359{
360 return test_bit(POSTED_INTR_ON,
361 (unsigned long *)&pi_desc->control);
362}
363
364static inline int pi_test_sn(struct pi_desc *pi_desc)
365{
366 return test_bit(POSTED_INTR_SN,
367 (unsigned long *)&pi_desc->control);
368}
369
370static inline u8 vmx_get_rvi(void)
371{
372 return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
373}
374
375static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx)
376{
377 vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS);
378}
379
380static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
381{
382 vmcs_write32(VM_ENTRY_CONTROLS, val);
383 vmx->vm_entry_controls_shadow = val;
384}
385
386static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
387{
388 if (vmx->vm_entry_controls_shadow != val)
389 vm_entry_controls_init(vmx, val);
390}
391
392static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
393{
394 return vmx->vm_entry_controls_shadow;
395}
396
397static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
398{
399 vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
400}
401
402static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
403{
404 vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
405}
406
407static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx)
408{
409 vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS);
410}
411
412static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
413{
414 vmcs_write32(VM_EXIT_CONTROLS, val);
415 vmx->vm_exit_controls_shadow = val;
416}
417
418static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
419{
420 if (vmx->vm_exit_controls_shadow != val)
421 vm_exit_controls_init(vmx, val);
422}
423
424static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
425{
426 return vmx->vm_exit_controls_shadow;
427}
428
429static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
430{
431 vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
432}
433
434static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
435{
436 vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
437}
438
439static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
440{
441 vmx->segment_cache.bitmask = 0;
442}
443
444static inline u32 vmx_vmentry_ctrl(void)
445{
446 u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
447 if (pt_mode == PT_MODE_SYSTEM)
448		vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | VM_ENTRY_LOAD_IA32_RTIT_CTL);
449 /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
450 return vmentry_ctrl &
451 ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER);
452}
453
454static inline u32 vmx_vmexit_ctrl(void)
455{
456 u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
457 if (pt_mode == PT_MODE_SYSTEM)
458		vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | VM_EXIT_CLEAR_IA32_RTIT_CTL);
459 /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
460	return vmexit_ctrl &
461 ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
462}
463
464u32 vmx_exec_control(struct vcpu_vmx *vmx);
465
466static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
467{
468 return container_of(kvm, struct kvm_vmx, kvm);
469}
470
471static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
472{
473 return container_of(vcpu, struct vcpu_vmx, vcpu);
474}
475
476static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
477{
478 return &(to_vmx(vcpu)->pi_desc);
479}
480
481struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu);
482void free_vmcs(struct vmcs *vmcs);
483int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
484void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
485void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs);
486void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs);
487
488static inline struct vmcs *alloc_vmcs(bool shadow)
489{
490 return alloc_vmcs_cpu(shadow, raw_smp_processor_id());
491}
492
493u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
494
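/*
 * With EPT enabled, use INVEPT when guest-physical mappings must be
 * invalidated or VPID is not in use; otherwise invalidate the vCPU's
 * VPID with INVVPID.
 */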
495static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid,
496 bool invalidate_gpa)
497{
498 if (enable_ept && (invalidate_gpa || !enable_vpid)) {
499 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
500 return;
501 ept_sync_context(construct_eptp(vcpu,
502 vcpu->arch.mmu->root_hpa));
503 } else {
504 vpid_sync_context(vpid);
505 }
506}
507
508static inline void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
509{
510 __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa);
511}
512
513static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx)
514{
515 vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
516 vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
517}
518
519#endif /* __KVM_X86_VMX_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f049ecfac7bb..02c8e095a239 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -69,6 +69,7 @@
69#include <asm/irq_remapping.h> 69#include <asm/irq_remapping.h>
70#include <asm/mshyperv.h> 70#include <asm/mshyperv.h>
71#include <asm/hypervisor.h> 71#include <asm/hypervisor.h>
72#include <asm/intel_pt.h>
72 73
73#define CREATE_TRACE_POINTS 74#define CREATE_TRACE_POINTS
74#include "trace.h" 75#include "trace.h"
@@ -213,6 +214,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
213 214
214u64 __read_mostly host_xcr0; 215u64 __read_mostly host_xcr0;
215 216
217struct kmem_cache *x86_fpu_cache;
218EXPORT_SYMBOL_GPL(x86_fpu_cache);
219
216static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); 220static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
217 221
218static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) 222static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
@@ -1121,7 +1125,13 @@ static u32 msrs_to_save[] = {
1121#endif 1125#endif
1122 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, 1126 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
1123 MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, 1127 MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
1124 MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES 1128 MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES,
1129 MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
1130 MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
1131 MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
1132 MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
1133 MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
1134 MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
1125}; 1135};
1126 1136
1127static unsigned num_msrs_to_save; 1137static unsigned num_msrs_to_save;
@@ -2999,6 +3009,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
2999 case KVM_CAP_HYPERV_TLBFLUSH: 3009 case KVM_CAP_HYPERV_TLBFLUSH:
3000 case KVM_CAP_HYPERV_SEND_IPI: 3010 case KVM_CAP_HYPERV_SEND_IPI:
3001 case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: 3011 case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
3012 case KVM_CAP_HYPERV_CPUID:
3002 case KVM_CAP_PCI_SEGMENT: 3013 case KVM_CAP_PCI_SEGMENT:
3003 case KVM_CAP_DEBUGREGS: 3014 case KVM_CAP_DEBUGREGS:
3004 case KVM_CAP_X86_ROBUST_SINGLESTEP: 3015 case KVM_CAP_X86_ROBUST_SINGLESTEP:
@@ -3010,7 +3021,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
3010 case KVM_CAP_HYPERV_TIME: 3021 case KVM_CAP_HYPERV_TIME:
3011 case KVM_CAP_IOAPIC_POLARITY_IGNORED: 3022 case KVM_CAP_IOAPIC_POLARITY_IGNORED:
3012 case KVM_CAP_TSC_DEADLINE_TIMER: 3023 case KVM_CAP_TSC_DEADLINE_TIMER:
3013 case KVM_CAP_ENABLE_CAP_VM:
3014 case KVM_CAP_DISABLE_QUIRKS: 3024 case KVM_CAP_DISABLE_QUIRKS:
3015 case KVM_CAP_SET_BOOT_CPU_ID: 3025 case KVM_CAP_SET_BOOT_CPU_ID:
3016 case KVM_CAP_SPLIT_IRQCHIP: 3026 case KVM_CAP_SPLIT_IRQCHIP:
@@ -3632,7 +3642,7 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
3632 3642
3633static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) 3643static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
3634{ 3644{
3635 struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave; 3645 struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
3636 u64 xstate_bv = xsave->header.xfeatures; 3646 u64 xstate_bv = xsave->header.xfeatures;
3637 u64 valid; 3647 u64 valid;
3638 3648
@@ -3674,7 +3684,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
3674 3684
3675static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) 3685static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
3676{ 3686{
3677 struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave; 3687 struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
3678 u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET); 3688 u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
3679 u64 valid; 3689 u64 valid;
3680 3690
@@ -3722,7 +3732,7 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
3722 fill_xsave((u8 *) guest_xsave->region, vcpu); 3732 fill_xsave((u8 *) guest_xsave->region, vcpu);
3723 } else { 3733 } else {
3724 memcpy(guest_xsave->region, 3734 memcpy(guest_xsave->region,
3725 &vcpu->arch.guest_fpu.state.fxsave, 3735 &vcpu->arch.guest_fpu->state.fxsave,
3726 sizeof(struct fxregs_state)); 3736 sizeof(struct fxregs_state));
3727 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = 3737 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
3728 XFEATURE_MASK_FPSSE; 3738 XFEATURE_MASK_FPSSE;
@@ -3752,7 +3762,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
3752 if (xstate_bv & ~XFEATURE_MASK_FPSSE || 3762 if (xstate_bv & ~XFEATURE_MASK_FPSSE ||
3753 mxcsr & ~mxcsr_feature_mask) 3763 mxcsr & ~mxcsr_feature_mask)
3754 return -EINVAL; 3764 return -EINVAL;
3755 memcpy(&vcpu->arch.guest_fpu.state.fxsave, 3765 memcpy(&vcpu->arch.guest_fpu->state.fxsave,
3756 guest_xsave->region, sizeof(struct fxregs_state)); 3766 guest_xsave->region, sizeof(struct fxregs_state));
3757 } 3767 }
3758 return 0; 3768 return 0;
@@ -3830,6 +3840,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
3830 return kvm_hv_activate_synic(vcpu, cap->cap == 3840 return kvm_hv_activate_synic(vcpu, cap->cap ==
3831 KVM_CAP_HYPERV_SYNIC2); 3841 KVM_CAP_HYPERV_SYNIC2);
3832 case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: 3842 case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
3843 if (!kvm_x86_ops->nested_enable_evmcs)
3844 return -ENOTTY;
3833 r = kvm_x86_ops->nested_enable_evmcs(vcpu, &vmcs_version); 3845 r = kvm_x86_ops->nested_enable_evmcs(vcpu, &vmcs_version);
3834 if (!r) { 3846 if (!r) {
3835 user_ptr = (void __user *)(uintptr_t)cap->args[0]; 3847 user_ptr = (void __user *)(uintptr_t)cap->args[0];
@@ -4192,6 +4204,25 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
4192 r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); 4204 r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
4193 break; 4205 break;
4194 } 4206 }
4207 case KVM_GET_SUPPORTED_HV_CPUID: {
4208 struct kvm_cpuid2 __user *cpuid_arg = argp;
4209 struct kvm_cpuid2 cpuid;
4210
4211 r = -EFAULT;
4212 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
4213 goto out;
4214
4215 r = kvm_vcpu_ioctl_get_hv_cpuid(vcpu, &cpuid,
4216 cpuid_arg->entries);
4217 if (r)
4218 goto out;
4219
4220 r = -EFAULT;
4221 if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
4222 goto out;
4223 r = 0;
4224 break;
4225 }
4195 default: 4226 default:
4196 r = -EINVAL; 4227 r = -EINVAL;
4197 } 4228 }
@@ -4396,7 +4427,7 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
4396 */ 4427 */
4397int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) 4428int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
4398{ 4429{
4399 bool is_dirty = false; 4430 bool flush = false;
4400 int r; 4431 int r;
4401 4432
4402 mutex_lock(&kvm->slots_lock); 4433 mutex_lock(&kvm->slots_lock);
@@ -4407,14 +4438,41 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
4407 if (kvm_x86_ops->flush_log_dirty) 4438 if (kvm_x86_ops->flush_log_dirty)
4408 kvm_x86_ops->flush_log_dirty(kvm); 4439 kvm_x86_ops->flush_log_dirty(kvm);
4409 4440
4410 r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); 4441 r = kvm_get_dirty_log_protect(kvm, log, &flush);
4411 4442
4412 /* 4443 /*
4413 * All the TLBs can be flushed out of mmu lock, see the comments in 4444 * All the TLBs can be flushed out of mmu lock, see the comments in
4414 * kvm_mmu_slot_remove_write_access(). 4445 * kvm_mmu_slot_remove_write_access().
4415 */ 4446 */
4416 lockdep_assert_held(&kvm->slots_lock); 4447 lockdep_assert_held(&kvm->slots_lock);
4417 if (is_dirty) 4448 if (flush)
4449 kvm_flush_remote_tlbs(kvm);
4450
4451 mutex_unlock(&kvm->slots_lock);
4452 return r;
4453}
4454
4455int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
4456{
4457 bool flush = false;
4458 int r;
4459
4460 mutex_lock(&kvm->slots_lock);
4461
4462 /*
4463 * Flush potentially hardware-cached dirty pages to dirty_bitmap.
4464 */
4465 if (kvm_x86_ops->flush_log_dirty)
4466 kvm_x86_ops->flush_log_dirty(kvm);
4467
4468 r = kvm_clear_dirty_log_protect(kvm, log, &flush);
4469
4470 /*
4471 * All the TLBs can be flushed out of mmu lock, see the comments in
4472 * kvm_mmu_slot_remove_write_access().
4473 */
4474 lockdep_assert_held(&kvm->slots_lock);
4475 if (flush)
4418 kvm_flush_remote_tlbs(kvm); 4476 kvm_flush_remote_tlbs(kvm);
4419 4477
4420 mutex_unlock(&kvm->slots_lock); 4478 mutex_unlock(&kvm->slots_lock);
@@ -4433,8 +4491,8 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
4433 return 0; 4491 return 0;
4434} 4492}
4435 4493
4436static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, 4494int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4437 struct kvm_enable_cap *cap) 4495 struct kvm_enable_cap *cap)
4438{ 4496{
4439 int r; 4497 int r;
4440 4498
@@ -4767,15 +4825,6 @@ set_identity_unlock:
4767 r = 0; 4825 r = 0;
4768 break; 4826 break;
4769 } 4827 }
4770 case KVM_ENABLE_CAP: {
4771 struct kvm_enable_cap cap;
4772
4773 r = -EFAULT;
4774 if (copy_from_user(&cap, argp, sizeof(cap)))
4775 goto out;
4776 r = kvm_vm_ioctl_enable_cap(kvm, &cap);
4777 break;
4778 }
4779 case KVM_MEMORY_ENCRYPT_OP: { 4828 case KVM_MEMORY_ENCRYPT_OP: {
4780 r = -ENOTTY; 4829 r = -ENOTTY;
4781 if (kvm_x86_ops->mem_enc_op) 4830 if (kvm_x86_ops->mem_enc_op)
@@ -4844,6 +4893,30 @@ static void kvm_init_msr_list(void)
4844 if (!kvm_x86_ops->rdtscp_supported()) 4893 if (!kvm_x86_ops->rdtscp_supported())
4845 continue; 4894 continue;
4846 break; 4895 break;
4896 case MSR_IA32_RTIT_CTL:
4897 case MSR_IA32_RTIT_STATUS:
4898 if (!kvm_x86_ops->pt_supported())
4899 continue;
4900 break;
4901 case MSR_IA32_RTIT_CR3_MATCH:
4902 if (!kvm_x86_ops->pt_supported() ||
4903 !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
4904 continue;
4905 break;
4906 case MSR_IA32_RTIT_OUTPUT_BASE:
4907 case MSR_IA32_RTIT_OUTPUT_MASK:
4908 if (!kvm_x86_ops->pt_supported() ||
4909 (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
4910 !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
4911 continue;
4912 break;
4913 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: {
4914 if (!kvm_x86_ops->pt_supported() ||
4915 msrs_to_save[i] - MSR_IA32_RTIT_ADDR0_A >=
4916 intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
4917 continue;
4918 break;
4919 }
4847 default: 4920 default:
4848 break; 4921 break;
4849 } 4922 }
@@ -6815,11 +6888,30 @@ int kvm_arch_init(void *opaque)
6815 goto out; 6888 goto out;
6816 } 6889 }
6817 6890
6891 /*
6892 * KVM explicitly assumes that the guest has an FPU and
6893 * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the
6894 * vCPU's FPU state as a fxregs_state struct.
6895 */
6896 if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
6897 printk(KERN_ERR "kvm: inadequate fpu\n");
6898 r = -EOPNOTSUPP;
6899 goto out;
6900 }
6901
6818 r = -ENOMEM; 6902 r = -ENOMEM;
6903 x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu),
6904 __alignof__(struct fpu), SLAB_ACCOUNT,
6905 NULL);
6906 if (!x86_fpu_cache) {
6907 printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n");
6908 goto out;
6909 }
6910
6819 shared_msrs = alloc_percpu(struct kvm_shared_msrs); 6911 shared_msrs = alloc_percpu(struct kvm_shared_msrs);
6820 if (!shared_msrs) { 6912 if (!shared_msrs) {
6821 printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n"); 6913 printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
6822 goto out; 6914 goto out_free_x86_fpu_cache;
6823 } 6915 }
6824 6916
6825 r = kvm_mmu_module_init(); 6917 r = kvm_mmu_module_init();
@@ -6852,6 +6944,8 @@ int kvm_arch_init(void *opaque)
6852 6944
6853out_free_percpu: 6945out_free_percpu:
6854 free_percpu(shared_msrs); 6946 free_percpu(shared_msrs);
6947out_free_x86_fpu_cache:
6948 kmem_cache_destroy(x86_fpu_cache);
6855out: 6949out:
6856 return r; 6950 return r;
6857} 6951}
@@ -6875,6 +6969,7 @@ void kvm_arch_exit(void)
6875 kvm_x86_ops = NULL; 6969 kvm_x86_ops = NULL;
6876 kvm_mmu_module_exit(); 6970 kvm_mmu_module_exit();
6877 free_percpu(shared_msrs); 6971 free_percpu(shared_msrs);
6972 kmem_cache_destroy(x86_fpu_cache);
6878} 6973}
6879 6974
6880int kvm_vcpu_halt(struct kvm_vcpu *vcpu) 6975int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
@@ -7998,9 +8093,9 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
7998static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 8093static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
7999{ 8094{
8000 preempt_disable(); 8095 preempt_disable();
8001 copy_fpregs_to_fpstate(&vcpu->arch.user_fpu); 8096 copy_fpregs_to_fpstate(&current->thread.fpu);
8002 /* PKRU is separately restored in kvm_x86_ops->run. */ 8097 /* PKRU is separately restored in kvm_x86_ops->run. */
8003 __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state, 8098 __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
8004 ~XFEATURE_MASK_PKRU); 8099 ~XFEATURE_MASK_PKRU);
8005 preempt_enable(); 8100 preempt_enable();
8006 trace_kvm_fpu(1); 8101 trace_kvm_fpu(1);
@@ -8010,8 +8105,8 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
8010static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 8105static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
8011{ 8106{
8012 preempt_disable(); 8107 preempt_disable();
8013 copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); 8108 copy_fpregs_to_fpstate(vcpu->arch.guest_fpu);
8014 copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state); 8109 copy_kernel_to_fpregs(&current->thread.fpu.state);
8015 preempt_enable(); 8110 preempt_enable();
8016 ++vcpu->stat.fpu_reload; 8111 ++vcpu->stat.fpu_reload;
8017 trace_kvm_fpu(0); 8112 trace_kvm_fpu(0);
@@ -8505,7 +8600,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
8505 8600
8506 vcpu_load(vcpu); 8601 vcpu_load(vcpu);
8507 8602
8508 fxsave = &vcpu->arch.guest_fpu.state.fxsave; 8603 fxsave = &vcpu->arch.guest_fpu->state.fxsave;
8509 memcpy(fpu->fpr, fxsave->st_space, 128); 8604 memcpy(fpu->fpr, fxsave->st_space, 128);
8510 fpu->fcw = fxsave->cwd; 8605 fpu->fcw = fxsave->cwd;
8511 fpu->fsw = fxsave->swd; 8606 fpu->fsw = fxsave->swd;
@@ -8525,7 +8620,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
8525 8620
8526 vcpu_load(vcpu); 8621 vcpu_load(vcpu);
8527 8622
8528 fxsave = &vcpu->arch.guest_fpu.state.fxsave; 8623 fxsave = &vcpu->arch.guest_fpu->state.fxsave;
8529 8624
8530 memcpy(fxsave->st_space, fpu->fpr, 128); 8625 memcpy(fxsave->st_space, fpu->fpr, 128);
8531 fxsave->cwd = fpu->fcw; 8626 fxsave->cwd = fpu->fcw;
@@ -8581,9 +8676,9 @@ static int sync_regs(struct kvm_vcpu *vcpu)
8581 8676
8582static void fx_init(struct kvm_vcpu *vcpu) 8677static void fx_init(struct kvm_vcpu *vcpu)
8583{ 8678{
8584 fpstate_init(&vcpu->arch.guest_fpu.state); 8679 fpstate_init(&vcpu->arch.guest_fpu->state);
8585 if (boot_cpu_has(X86_FEATURE_XSAVES)) 8680 if (boot_cpu_has(X86_FEATURE_XSAVES))
8586 vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv = 8681 vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv =
8587 host_xcr0 | XSTATE_COMPACTION_ENABLED; 8682 host_xcr0 | XSTATE_COMPACTION_ENABLED;
8588 8683
8589 /* 8684 /*
@@ -8621,6 +8716,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
8621 8716
8622int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 8717int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
8623{ 8718{
8719 vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
8624 kvm_vcpu_mtrr_init(vcpu); 8720 kvm_vcpu_mtrr_init(vcpu);
8625 vcpu_load(vcpu); 8721 vcpu_load(vcpu);
8626 kvm_vcpu_reset(vcpu, false); 8722 kvm_vcpu_reset(vcpu, false);
@@ -8707,11 +8803,11 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
8707 */ 8803 */
8708 if (init_event) 8804 if (init_event)
8709 kvm_put_guest_fpu(vcpu); 8805 kvm_put_guest_fpu(vcpu);
8710 mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave, 8806 mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
8711 XFEATURE_MASK_BNDREGS); 8807 XFEATURE_MASK_BNDREGS);
8712 if (mpx_state_buffer) 8808 if (mpx_state_buffer)
8713 memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state)); 8809 memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state));
8714 mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave, 8810 mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
8715 XFEATURE_MASK_BNDCSR); 8811 XFEATURE_MASK_BNDCSR);
8716 if (mpx_state_buffer) 8812 if (mpx_state_buffer)
8717 memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); 8813 memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
@@ -8723,7 +8819,6 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
8723 kvm_pmu_reset(vcpu); 8819 kvm_pmu_reset(vcpu);
8724 vcpu->arch.smbase = 0x30000; 8820 vcpu->arch.smbase = 0x30000;
8725 8821
8726 vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
8727 vcpu->arch.msr_misc_features_enables = 0; 8822 vcpu->arch.msr_misc_features_enables = 0;
8728 8823
8729 vcpu->arch.xcr0 = XFEATURE_MASK_FP; 8824 vcpu->arch.xcr0 = XFEATURE_MASK_FP;
@@ -9282,7 +9377,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
9282 * with dirty logging disabled in order to eliminate unnecessary GPA 9377 * with dirty logging disabled in order to eliminate unnecessary GPA
9283 * logging in PML buffer (and potential PML buffer full VMEXT). This 9378 * logging in PML buffer (and potential PML buffer full VMEXT). This
9284 * guarantees leaving PML enabled during guest's lifetime won't have 9379 * guarantees leaving PML enabled during guest's lifetime won't have
9285 * any additonal overhead from PML when guest is running with dirty 9380 * any additional overhead from PML when guest is running with dirty
9286 * logging disabled for memory slots. 9381 * logging disabled for memory slots.
9287 * 9382 *
9288 * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot 9383 * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot
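Note on the kvm_arch_init() hunk above: the new x86_fpu_cache setup follows the usual kmem_cache create/unwind pattern. A minimal stand-alone sketch of that pattern (the module, cache name and demo object type are illustrative, not KVM's):

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>

struct demo_obj {
	u64 state[64];
};

static struct kmem_cache *demo_cache;

static int __init demo_init(void)
{
	int r = -ENOMEM;

	/* Accounted slab cache, sized and aligned for the object type. */
	demo_cache = kmem_cache_create("demo_obj", sizeof(struct demo_obj),
				       __alignof__(struct demo_obj),
				       SLAB_ACCOUNT, NULL);
	if (!demo_cache)
		goto out;

	/* Later failures unwind in reverse order, ending at the cache. */
	return 0;

out:
	return r;
}

static void __exit demo_exit(void)
{
	/* kmem_cache_destroy() accepts NULL, keeping the exit path simple. */
	kmem_cache_destroy(demo_cache);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");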
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index 332d7c34be5c..11273cd384d6 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -143,7 +143,7 @@ static int hv_ce_shutdown(struct clock_event_device *evt)
143 143
144static int hv_ce_set_oneshot(struct clock_event_device *evt) 144static int hv_ce_set_oneshot(struct clock_event_device *evt)
145{ 145{
146 union hv_timer_config timer_cfg; 146 union hv_stimer_config timer_cfg;
147 147
148 timer_cfg.as_uint64 = 0; 148 timer_cfg.as_uint64 = 0;
149 timer_cfg.enable = 1; 149 timer_cfg.enable = 1;
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 87d3d7da78f8..ea201034b248 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -44,74 +44,6 @@
44 */ 44 */
45#define HV_UTIL_NEGO_TIMEOUT 55 45#define HV_UTIL_NEGO_TIMEOUT 55
46 46
47/* Define synthetic interrupt controller flag constants. */
48#define HV_EVENT_FLAGS_COUNT (256 * 8)
49#define HV_EVENT_FLAGS_LONG_COUNT (256 / sizeof(unsigned long))
50
51/*
52 * Timer configuration register.
53 */
54union hv_timer_config {
55 u64 as_uint64;
56 struct {
57 u64 enable:1;
58 u64 periodic:1;
59 u64 lazy:1;
60 u64 auto_enable:1;
61 u64 apic_vector:8;
62 u64 direct_mode:1;
63 u64 reserved_z0:3;
64 u64 sintx:4;
65 u64 reserved_z1:44;
66 };
67};
68
69
70/* Define the synthetic interrupt controller event flags format. */
71union hv_synic_event_flags {
72 unsigned long flags[HV_EVENT_FLAGS_LONG_COUNT];
73};
74
75/* Define SynIC control register. */
76union hv_synic_scontrol {
77 u64 as_uint64;
78 struct {
79 u64 enable:1;
80 u64 reserved:63;
81 };
82};
83
84/* Define synthetic interrupt source. */
85union hv_synic_sint {
86 u64 as_uint64;
87 struct {
88 u64 vector:8;
89 u64 reserved1:8;
90 u64 masked:1;
91 u64 auto_eoi:1;
92 u64 reserved2:46;
93 };
94};
95
96/* Define the format of the SIMP register */
97union hv_synic_simp {
98 u64 as_uint64;
99 struct {
100 u64 simp_enabled:1;
101 u64 preserved:11;
102 u64 base_simp_gpa:52;
103 };
104};
105
106/* Define the format of the SIEFP register */
107union hv_synic_siefp {
108 u64 as_uint64;
109 struct {
110 u64 siefp_enabled:1;
111 u64 preserved:11;
112 u64 base_siefp_gpa:52;
113 };
114};
115 47
116/* Definitions for the monitored notification facility */ 48/* Definitions for the monitored notification facility */
117union hv_monitor_trigger_group { 49union hv_monitor_trigger_group {
diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
index 6502feb9524b..33771352dcd6 100644
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -21,7 +21,6 @@
21 21
22#include <linux/clocksource.h> 22#include <linux/clocksource.h>
23#include <linux/hrtimer.h> 23#include <linux/hrtimer.h>
24#include <linux/workqueue.h>
25 24
26struct arch_timer_context { 25struct arch_timer_context {
27 /* Registers: control register, timer value */ 26 /* Registers: control register, timer value */
@@ -52,9 +51,6 @@ struct arch_timer_cpu {
52 /* Background timer used when the guest is not running */ 51 /* Background timer used when the guest is not running */
53 struct hrtimer bg_timer; 52 struct hrtimer bg_timer;
54 53
55 /* Work queued with the above timer expires */
56 struct work_struct expired;
57
58 /* Physical timer emulation */ 54 /* Physical timer emulation */
59 struct hrtimer phys_timer; 55 struct hrtimer phys_timer;
60 56
diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h
index f8c400ba1929..fe07b680dd4a 100644
--- a/include/linux/compiler_attributes.h
+++ b/include/linux/compiler_attributes.h
@@ -37,7 +37,6 @@
37# define __GCC4_has_attribute___designated_init__ 0 37# define __GCC4_has_attribute___designated_init__ 0
38# define __GCC4_has_attribute___externally_visible__ 1 38# define __GCC4_has_attribute___externally_visible__ 1
39# define __GCC4_has_attribute___noclone__ 1 39# define __GCC4_has_attribute___noclone__ 1
40# define __GCC4_has_attribute___optimize__ 1
41# define __GCC4_has_attribute___nonstring__ 0 40# define __GCC4_has_attribute___nonstring__ 0
42# define __GCC4_has_attribute___no_sanitize_address__ (__GNUC_MINOR__ >= 8) 41# define __GCC4_has_attribute___no_sanitize_address__ (__GNUC_MINOR__ >= 8)
43#endif 42#endif
@@ -163,17 +162,11 @@
163 162
164/* 163/*
165 * Optional: not supported by clang 164 * Optional: not supported by clang
166 * Note: icc does not recognize gcc's no-tracer
167 * 165 *
168 * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-noclone-function-attribute 166 * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-noclone-function-attribute
169 * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-optimize-function-attribute
170 */ 167 */
171#if __has_attribute(__noclone__) 168#if __has_attribute(__noclone__)
172# if __has_attribute(__optimize__) 169# define __noclone __attribute__((__noclone__))
173# define __noclone __attribute__((__noclone__, __optimize__("no-tracer")))
174# else
175# define __noclone __attribute__((__noclone__))
176# endif
177#else 170#else
178# define __noclone 171# define __noclone
179#endif 172#endif
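The attribute itself is used as before; only the implied "no-tracer" optimize hint is dropped. A minimal, hypothetical use of __noclone:

/* Keep GCC from emitting specialized clones of this function, so it has
 * exactly one instance with a stable address (useful e.g. for asm callers).
 */
static __noclone int demo_add(int a, int b)
{
	return a + b;
}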
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c926698040e0..c38cc5eb7e73 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -449,6 +449,7 @@ struct kvm {
449#endif 449#endif
450 long tlbs_dirty; 450 long tlbs_dirty;
451 struct list_head devices; 451 struct list_head devices;
452 bool manual_dirty_log_protect;
452 struct dentry *debugfs_dentry; 453 struct dentry *debugfs_dentry;
453 struct kvm_stat_data **debugfs_stat_data; 454 struct kvm_stat_data **debugfs_stat_data;
454 struct srcu_struct srcu; 455 struct srcu_struct srcu;
@@ -694,7 +695,8 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
694int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 695int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
695 void *data, unsigned long len); 696 void *data, unsigned long len);
696int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 697int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
697 void *data, int offset, unsigned long len); 698 void *data, unsigned int offset,
699 unsigned long len);
698int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 700int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
699 gpa_t gpa, unsigned long len); 701 gpa_t gpa, unsigned long len);
700int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); 702int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
@@ -753,7 +755,9 @@ int kvm_get_dirty_log(struct kvm *kvm,
753 struct kvm_dirty_log *log, int *is_dirty); 755 struct kvm_dirty_log *log, int *is_dirty);
754 756
755int kvm_get_dirty_log_protect(struct kvm *kvm, 757int kvm_get_dirty_log_protect(struct kvm *kvm,
756 struct kvm_dirty_log *log, bool *is_dirty); 758 struct kvm_dirty_log *log, bool *flush);
759int kvm_clear_dirty_log_protect(struct kvm *kvm,
760 struct kvm_clear_dirty_log *log, bool *flush);
757 761
758void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, 762void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
759 struct kvm_memory_slot *slot, 763 struct kvm_memory_slot *slot,
@@ -762,9 +766,13 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
762 766
763int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 767int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
764 struct kvm_dirty_log *log); 768 struct kvm_dirty_log *log);
769int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
770 struct kvm_clear_dirty_log *log);
765 771
766int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, 772int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
767 bool line_status); 773 bool line_status);
774int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
775 struct kvm_enable_cap *cap);
768long kvm_arch_vm_ioctl(struct file *filp, 776long kvm_arch_vm_ioctl(struct file *filp,
769 unsigned int ioctl, unsigned long arg); 777 unsigned int ioctl, unsigned long arg);
770 778
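A rough sketch of how an architecture is expected to consume the reworked declarations above: the boolean out-parameter now reports whether a TLB flush is needed, not whether anything was dirty. The function name is illustrative; the locking and flush calls mirror what the x86/MIPS handlers in this series do.

#include <linux/kvm_host.h>

int demo_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
{
	bool flush = false;
	int r;

	mutex_lock(&kvm->slots_lock);
	r = kvm_get_dirty_log_protect(kvm, log, &flush);
	/* Write-protection only takes effect once remote TLBs are flushed. */
	if (flush)
		kvm_flush_remote_tlbs(kvm);
	mutex_unlock(&kvm->slots_lock);
	return r;
}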
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 2b7a652c9fa4..6d4ea4b6c922 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -492,6 +492,17 @@ struct kvm_dirty_log {
492 }; 492 };
493}; 493};
494 494
495/* for KVM_CLEAR_DIRTY_LOG */
496struct kvm_clear_dirty_log {
497 __u32 slot;
498 __u32 num_pages;
499 __u64 first_page;
500 union {
501 void __user *dirty_bitmap; /* one bit per page */
502 __u64 padding2;
503 };
504};
505
495/* for KVM_SET_SIGNAL_MASK */ 506/* for KVM_SET_SIGNAL_MASK */
496struct kvm_signal_mask { 507struct kvm_signal_mask {
497 __u32 len; 508 __u32 len;
@@ -975,6 +986,8 @@ struct kvm_ppc_resize_hpt {
975#define KVM_CAP_HYPERV_ENLIGHTENED_VMCS 163 986#define KVM_CAP_HYPERV_ENLIGHTENED_VMCS 163
976#define KVM_CAP_EXCEPTION_PAYLOAD 164 987#define KVM_CAP_EXCEPTION_PAYLOAD 164
977#define KVM_CAP_ARM_VM_IPA_SIZE 165 988#define KVM_CAP_ARM_VM_IPA_SIZE 165
989#define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166
990#define KVM_CAP_HYPERV_CPUID 167
978 991
979#ifdef KVM_CAP_IRQ_ROUTING 992#ifdef KVM_CAP_IRQ_ROUTING
980 993
@@ -1421,6 +1434,12 @@ struct kvm_enc_region {
1421#define KVM_GET_NESTED_STATE _IOWR(KVMIO, 0xbe, struct kvm_nested_state) 1434#define KVM_GET_NESTED_STATE _IOWR(KVMIO, 0xbe, struct kvm_nested_state)
1422#define KVM_SET_NESTED_STATE _IOW(KVMIO, 0xbf, struct kvm_nested_state) 1435#define KVM_SET_NESTED_STATE _IOW(KVMIO, 0xbf, struct kvm_nested_state)
1423 1436
1437/* Available with KVM_CAP_MANUAL_DIRTY_LOG_PROTECT */
1438#define KVM_CLEAR_DIRTY_LOG _IOWR(KVMIO, 0xc0, struct kvm_clear_dirty_log)
1439
1440/* Available with KVM_CAP_HYPERV_CPUID */
1441#define KVM_GET_SUPPORTED_HV_CPUID _IOWR(KVMIO, 0xc1, struct kvm_cpuid2)
1442
1424/* Secure Encrypted Virtualization command */ 1443/* Secure Encrypted Virtualization command */
1425enum sev_cmd_id { 1444enum sev_cmd_id {
1426 /* Guest initialization commands */ 1445 /* Guest initialization commands */
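A hedged userspace sketch of the two new interfaces defined above: enable KVM_CAP_MANUAL_DIRTY_LOG_PROTECT once on the VM file descriptor, then use KVM_CLEAR_DIRTY_LOG to re-protect a page range after the dirty bitmap has been harvested. vm_fd, slot and the bitmap buffer are placeholders supplied by the caller.

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int enable_manual_dirty_log_protect(int vm_fd)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT,
		.args[0] = 1,
	};

	/* With the cap enabled, KVM_GET_DIRTY_LOG stops write-protecting
	 * pages; userspace re-protects them explicitly with the clear ioctl. */
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}

static int clear_dirty_range(int vm_fd, __u32 slot, void *bitmap,
			     __u64 first_page, __u32 num_pages)
{
	struct kvm_clear_dirty_log clear = {
		.slot = slot,
		.first_page = first_page,
		.num_pages = num_pages,
		.dirty_bitmap = bitmap,	/* one bit per page, as harvested */
	};

	return ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear);
}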
diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat
index 195ba486640f..2ed395b817cb 100755
--- a/tools/kvm/kvm_stat/kvm_stat
+++ b/tools/kvm/kvm_stat/kvm_stat
@@ -1,4 +1,4 @@
1#!/usr/bin/python 1#!/usr/bin/env python3
2# 2#
3# top-like utility for displaying kvm statistics 3# top-like utility for displaying kvm statistics
4# 4#
diff --git a/tools/testing/selftests/android/Makefile b/tools/testing/selftests/android/Makefile
index d9a725478375..72c25a3cb658 100644
--- a/tools/testing/selftests/android/Makefile
+++ b/tools/testing/selftests/android/Makefile
@@ -6,7 +6,7 @@ TEST_PROGS := run.sh
6 6
7include ../lib.mk 7include ../lib.mk
8 8
9all: khdr 9all:
10 @for DIR in $(SUBDIRS); do \ 10 @for DIR in $(SUBDIRS); do \
11 BUILD_TARGET=$(OUTPUT)/$$DIR; \ 11 BUILD_TARGET=$(OUTPUT)/$$DIR; \
12 mkdir $$BUILD_TARGET -p; \ 12 mkdir $$BUILD_TARGET -p; \
diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile
index ad1eeb14fda7..30996306cabc 100644
--- a/tools/testing/selftests/futex/functional/Makefile
+++ b/tools/testing/selftests/futex/functional/Makefile
@@ -19,6 +19,7 @@ TEST_GEN_FILES := \
19TEST_PROGS := run.sh 19TEST_PROGS := run.sh
20 20
21top_srcdir = ../../../../.. 21top_srcdir = ../../../../..
22KSFT_KHDR_INSTALL := 1
22include ../../lib.mk 23include ../../lib.mk
23 24
24$(TEST_GEN_FILES): $(HEADERS) 25$(TEST_GEN_FILES): $(HEADERS)
diff --git a/tools/testing/selftests/gpio/Makefile b/tools/testing/selftests/gpio/Makefile
index 46648427d537..07f572a1bd3f 100644
--- a/tools/testing/selftests/gpio/Makefile
+++ b/tools/testing/selftests/gpio/Makefile
@@ -10,8 +10,6 @@ TEST_PROGS_EXTENDED := gpio-mockup-chardev
10GPIODIR := $(realpath ../../../gpio) 10GPIODIR := $(realpath ../../../gpio)
11GPIOOBJ := gpio-utils.o 11GPIOOBJ := gpio-utils.o
12 12
13include ../lib.mk
14
15all: $(TEST_PROGS_EXTENDED) 13all: $(TEST_PROGS_EXTENDED)
16 14
17override define CLEAN 15override define CLEAN
@@ -19,7 +17,9 @@ override define CLEAN
19 $(MAKE) -C $(GPIODIR) OUTPUT=$(GPIODIR)/ clean 17 $(MAKE) -C $(GPIODIR) OUTPUT=$(GPIODIR)/ clean
20endef 18endef
21 19
22$(TEST_PROGS_EXTENDED):| khdr 20KSFT_KHDR_INSTALL := 1
21include ../lib.mk
22
23$(TEST_PROGS_EXTENDED): $(GPIODIR)/$(GPIOOBJ) 23$(TEST_PROGS_EXTENDED): $(GPIODIR)/$(GPIOOBJ)
24 24
25$(GPIODIR)/$(GPIOOBJ): 25$(GPIODIR)/$(GPIOOBJ):
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 01a219229238..f9a0e9938480 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -1,6 +1,7 @@
1all: 1all:
2 2
3top_srcdir = ../../../.. 3top_srcdir = ../../../..
4KSFT_KHDR_INSTALL := 1
4UNAME_M := $(shell uname -m) 5UNAME_M := $(shell uname -m)
5 6
6LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/ucall.c lib/sparsebit.c 7LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/ucall.c lib/sparsebit.c
@@ -14,9 +15,12 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
14TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test 15TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test
15TEST_GEN_PROGS_x86_64 += x86_64/state_test 16TEST_GEN_PROGS_x86_64 += x86_64/state_test
16TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test 17TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test
18TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
17TEST_GEN_PROGS_x86_64 += dirty_log_test 19TEST_GEN_PROGS_x86_64 += dirty_log_test
20TEST_GEN_PROGS_x86_64 += clear_dirty_log_test
18 21
19TEST_GEN_PROGS_aarch64 += dirty_log_test 22TEST_GEN_PROGS_aarch64 += dirty_log_test
23TEST_GEN_PROGS_aarch64 += clear_dirty_log_test
20 24
21TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M)) 25TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M))
22LIBKVM += $(LIBKVM_$(UNAME_M)) 26LIBKVM += $(LIBKVM_$(UNAME_M))
@@ -44,7 +48,6 @@ $(OUTPUT)/libkvm.a: $(LIBKVM_OBJ)
44 48
45all: $(STATIC_LIBS) 49all: $(STATIC_LIBS)
46$(TEST_GEN_PROGS): $(STATIC_LIBS) 50$(TEST_GEN_PROGS): $(STATIC_LIBS)
47$(STATIC_LIBS):| khdr
48 51
49cscope: include_paths = $(LINUX_TOOL_INCLUDE) $(LINUX_HDR_PATH) include lib .. 52cscope: include_paths = $(LINUX_TOOL_INCLUDE) $(LINUX_HDR_PATH) include lib ..
50cscope: 53cscope:
diff --git a/tools/testing/selftests/kvm/clear_dirty_log_test.c b/tools/testing/selftests/kvm/clear_dirty_log_test.c
new file mode 100644
index 000000000000..749336937d37
--- /dev/null
+++ b/tools/testing/selftests/kvm/clear_dirty_log_test.c
@@ -0,0 +1,2 @@
1#define USE_CLEAR_DIRTY_LOG
2#include "dirty_log_test.c"
diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c
index aeff95a91b15..4715cfba20dc 100644
--- a/tools/testing/selftests/kvm/dirty_log_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_test.c
@@ -51,10 +51,17 @@ static uint64_t random_array[TEST_PAGES_PER_LOOP];
51static uint64_t iteration; 51static uint64_t iteration;
52 52
53/* 53/*
54 * GPA offset of the testing memory slot. Must be bigger than 54 * Guest physical memory offset of the testing memory slot.
55 * DEFAULT_GUEST_PHY_PAGES. 55 * This will be set to the topmost valid physical address minus
56 * the test memory size.
56 */ 57 */
57static uint64_t guest_test_mem = DEFAULT_GUEST_TEST_MEM; 58static uint64_t guest_test_phys_mem;
59
60/*
61 * Guest virtual memory offset of the testing memory slot.
62 * Must not conflict with identity mapped test code.
63 */
64static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM;
58 65
59/* 66/*
60 * Continuously write to the first 8 bytes of random pages within 67 * Continuously write to the first 8 bytes of random pages within
@@ -66,7 +73,7 @@ static void guest_code(void)
66 73
67 while (true) { 74 while (true) {
68 for (i = 0; i < TEST_PAGES_PER_LOOP; i++) { 75 for (i = 0; i < TEST_PAGES_PER_LOOP; i++) {
69 uint64_t addr = guest_test_mem; 76 uint64_t addr = guest_test_virt_mem;
70 addr += (READ_ONCE(random_array[i]) % guest_num_pages) 77 addr += (READ_ONCE(random_array[i]) % guest_num_pages)
71 * guest_page_size; 78 * guest_page_size;
72 addr &= ~(host_page_size - 1); 79 addr &= ~(host_page_size - 1);
@@ -209,12 +216,14 @@ static void vm_dirty_log_verify(unsigned long *bmap)
209} 216}
210 217
211static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, 218static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid,
212 uint64_t extra_mem_pages, void *guest_code) 219 uint64_t extra_mem_pages, void *guest_code,
220 unsigned long type)
213{ 221{
214 struct kvm_vm *vm; 222 struct kvm_vm *vm;
215 uint64_t extra_pg_pages = extra_mem_pages / 512 * 2; 223 uint64_t extra_pg_pages = extra_mem_pages / 512 * 2;
216 224
217 vm = vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); 225 vm = _vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages,
226 O_RDWR, type);
218 kvm_vm_elf_load(vm, program_invocation_name, 0, 0); 227 kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
219#ifdef __x86_64__ 228#ifdef __x86_64__
220 vm_create_irqchip(vm); 229 vm_create_irqchip(vm);
@@ -224,13 +233,14 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid,
224} 233}
225 234
226static void run_test(enum vm_guest_mode mode, unsigned long iterations, 235static void run_test(enum vm_guest_mode mode, unsigned long iterations,
227 unsigned long interval, bool top_offset) 236 unsigned long interval, uint64_t phys_offset)
228{ 237{
229 unsigned int guest_pa_bits, guest_page_shift; 238 unsigned int guest_pa_bits, guest_page_shift;
230 pthread_t vcpu_thread; 239 pthread_t vcpu_thread;
231 struct kvm_vm *vm; 240 struct kvm_vm *vm;
232 uint64_t max_gfn; 241 uint64_t max_gfn;
233 unsigned long *bmap; 242 unsigned long *bmap;
243 unsigned long type = 0;
234 244
235 switch (mode) { 245 switch (mode) {
236 case VM_MODE_P52V48_4K: 246 case VM_MODE_P52V48_4K:
@@ -241,6 +251,14 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
241 guest_pa_bits = 52; 251 guest_pa_bits = 52;
242 guest_page_shift = 16; 252 guest_page_shift = 16;
243 break; 253 break;
254 case VM_MODE_P48V48_4K:
255 guest_pa_bits = 48;
256 guest_page_shift = 12;
257 break;
258 case VM_MODE_P48V48_64K:
259 guest_pa_bits = 48;
260 guest_page_shift = 16;
261 break;
244 case VM_MODE_P40V48_4K: 262 case VM_MODE_P40V48_4K:
245 guest_pa_bits = 40; 263 guest_pa_bits = 40;
246 guest_page_shift = 12; 264 guest_page_shift = 12;
@@ -255,6 +273,19 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
255 273
256 DEBUG("Testing guest mode: %s\n", vm_guest_mode_string(mode)); 274 DEBUG("Testing guest mode: %s\n", vm_guest_mode_string(mode));
257 275
276#ifdef __x86_64__
277 /*
278 * FIXME
279 * The x86_64 kvm selftests framework currently only supports a
280 * single PML4 which restricts the number of physical address
281 * bits we can change to 39.
282 */
283 guest_pa_bits = 39;
284#endif
285#ifdef __aarch64__
286 if (guest_pa_bits != 40)
287 type = KVM_VM_TYPE_ARM_IPA_SIZE(guest_pa_bits);
288#endif
258 max_gfn = (1ul << (guest_pa_bits - guest_page_shift)) - 1; 289 max_gfn = (1ul << (guest_pa_bits - guest_page_shift)) - 1;
259 guest_page_size = (1ul << guest_page_shift); 290 guest_page_size = (1ul << guest_page_shift);
260 /* 1G of guest page sized pages */ 291 /* 1G of guest page sized pages */
@@ -263,31 +294,41 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
263 host_num_pages = (guest_num_pages * guest_page_size) / host_page_size + 294 host_num_pages = (guest_num_pages * guest_page_size) / host_page_size +
264 !!((guest_num_pages * guest_page_size) % host_page_size); 295 !!((guest_num_pages * guest_page_size) % host_page_size);
265 296
266 if (top_offset) { 297 if (!phys_offset) {
267 guest_test_mem = (max_gfn - guest_num_pages) * guest_page_size; 298 guest_test_phys_mem = (max_gfn - guest_num_pages) * guest_page_size;
268 guest_test_mem &= ~(host_page_size - 1); 299 guest_test_phys_mem &= ~(host_page_size - 1);
300 } else {
301 guest_test_phys_mem = phys_offset;
269 } 302 }
270 303
271 DEBUG("guest test mem offset: 0x%lx\n", guest_test_mem); 304 DEBUG("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem);
272 305
273 bmap = bitmap_alloc(host_num_pages); 306 bmap = bitmap_alloc(host_num_pages);
274 host_bmap_track = bitmap_alloc(host_num_pages); 307 host_bmap_track = bitmap_alloc(host_num_pages);
275 308
276 vm = create_vm(mode, VCPU_ID, guest_num_pages, guest_code); 309 vm = create_vm(mode, VCPU_ID, guest_num_pages, guest_code, type);
310
311#ifdef USE_CLEAR_DIRTY_LOG
312 struct kvm_enable_cap cap = {};
313
314 cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT;
315 cap.args[0] = 1;
316 vm_enable_cap(vm, &cap);
317#endif
277 318
278 /* Add an extra memory slot for testing dirty logging */ 319 /* Add an extra memory slot for testing dirty logging */
279 vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 320 vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
280 guest_test_mem, 321 guest_test_phys_mem,
281 TEST_MEM_SLOT_INDEX, 322 TEST_MEM_SLOT_INDEX,
282 guest_num_pages, 323 guest_num_pages,
283 KVM_MEM_LOG_DIRTY_PAGES); 324 KVM_MEM_LOG_DIRTY_PAGES);
284 325
285 /* Do 1:1 mapping for the dirty track memory slot */ 326 /* Do mapping for the dirty track memory slot */
286 virt_map(vm, guest_test_mem, guest_test_mem, 327 virt_map(vm, guest_test_virt_mem, guest_test_phys_mem,
287 guest_num_pages * guest_page_size, 0); 328 guest_num_pages * guest_page_size, 0);
288 329
289 /* Cache the HVA pointer of the region */ 330 /* Cache the HVA pointer of the region */
290 host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_mem); 331 host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem);
291 332
292#ifdef __x86_64__ 333#ifdef __x86_64__
293 vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); 334 vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
@@ -299,7 +340,7 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
299 /* Export the shared variables to the guest */ 340 /* Export the shared variables to the guest */
300 sync_global_to_guest(vm, host_page_size); 341 sync_global_to_guest(vm, host_page_size);
301 sync_global_to_guest(vm, guest_page_size); 342 sync_global_to_guest(vm, guest_page_size);
302 sync_global_to_guest(vm, guest_test_mem); 343 sync_global_to_guest(vm, guest_test_virt_mem);
303 sync_global_to_guest(vm, guest_num_pages); 344 sync_global_to_guest(vm, guest_num_pages);
304 345
305 /* Start the iterations */ 346 /* Start the iterations */
@@ -316,6 +357,10 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
316 /* Give the vcpu thread some time to dirty some pages */ 357 /* Give the vcpu thread some time to dirty some pages */
317 usleep(interval * 1000); 358 usleep(interval * 1000);
318 kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); 359 kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap);
360#ifdef USE_CLEAR_DIRTY_LOG
361 kvm_vm_clear_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap, 0,
362 DIV_ROUND_UP(host_num_pages, 64) * 64);
363#endif
319 vm_dirty_log_verify(bmap); 364 vm_dirty_log_verify(bmap);
320 iteration++; 365 iteration++;
321 sync_global_to_guest(vm, iteration); 366 sync_global_to_guest(vm, iteration);
@@ -335,23 +380,16 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
335 kvm_vm_free(vm); 380 kvm_vm_free(vm);
336} 381}
337 382
338static struct vm_guest_modes { 383struct vm_guest_mode_params {
339 enum vm_guest_mode mode;
340 bool supported; 384 bool supported;
341 bool enabled; 385 bool enabled;
342} vm_guest_modes[NUM_VM_MODES] = {
343#if defined(__x86_64__)
344 { VM_MODE_P52V48_4K, 1, 1, },
345 { VM_MODE_P52V48_64K, 0, 0, },
346 { VM_MODE_P40V48_4K, 0, 0, },
347 { VM_MODE_P40V48_64K, 0, 0, },
348#elif defined(__aarch64__)
349 { VM_MODE_P52V48_4K, 0, 0, },
350 { VM_MODE_P52V48_64K, 0, 0, },
351 { VM_MODE_P40V48_4K, 1, 1, },
352 { VM_MODE_P40V48_64K, 1, 1, },
353#endif
354}; 386};
387struct vm_guest_mode_params vm_guest_mode_params[NUM_VM_MODES];
388
389#define vm_guest_mode_params_init(mode, supported, enabled) \
390({ \
391 vm_guest_mode_params[mode] = (struct vm_guest_mode_params){ supported, enabled }; \
392})
355 393
356static void help(char *name) 394static void help(char *name)
357{ 395{
@@ -359,25 +397,21 @@ static void help(char *name)
359 397
360 puts(""); 398 puts("");
361 printf("usage: %s [-h] [-i iterations] [-I interval] " 399 printf("usage: %s [-h] [-i iterations] [-I interval] "
362 "[-o offset] [-t] [-m mode]\n", name); 400 "[-p offset] [-m mode]\n", name);
363 puts(""); 401 puts("");
364 printf(" -i: specify iteration counts (default: %"PRIu64")\n", 402 printf(" -i: specify iteration counts (default: %"PRIu64")\n",
365 TEST_HOST_LOOP_N); 403 TEST_HOST_LOOP_N);
366 printf(" -I: specify interval in ms (default: %"PRIu64" ms)\n", 404 printf(" -I: specify interval in ms (default: %"PRIu64" ms)\n",
367 TEST_HOST_LOOP_INTERVAL); 405 TEST_HOST_LOOP_INTERVAL);
368 printf(" -o: guest test memory offset (default: 0x%lx)\n", 406 printf(" -p: specify guest physical test memory offset\n"
369 DEFAULT_GUEST_TEST_MEM); 407 " Warning: a low offset can conflict with the loaded test code.\n");
370 printf(" -t: map guest test memory at the top of the allowed "
371 "physical address range\n");
372 printf(" -m: specify the guest mode ID to test " 408 printf(" -m: specify the guest mode ID to test "
373 "(default: test all supported modes)\n" 409 "(default: test all supported modes)\n"
374 " This option may be used multiple times.\n" 410 " This option may be used multiple times.\n"
375 " Guest mode IDs:\n"); 411 " Guest mode IDs:\n");
376 for (i = 0; i < NUM_VM_MODES; ++i) { 412 for (i = 0; i < NUM_VM_MODES; ++i) {
377 printf(" %d: %s%s\n", 413 printf(" %d: %s%s\n", i, vm_guest_mode_string(i),
378 vm_guest_modes[i].mode, 414 vm_guest_mode_params[i].supported ? " (supported)" : "");
379 vm_guest_mode_string(vm_guest_modes[i].mode),
380 vm_guest_modes[i].supported ? " (supported)" : "");
381 } 415 }
382 puts(""); 416 puts("");
383 exit(0); 417 exit(0);
@@ -388,11 +422,34 @@ int main(int argc, char *argv[])
388 unsigned long iterations = TEST_HOST_LOOP_N; 422 unsigned long iterations = TEST_HOST_LOOP_N;
389 unsigned long interval = TEST_HOST_LOOP_INTERVAL; 423 unsigned long interval = TEST_HOST_LOOP_INTERVAL;
390 bool mode_selected = false; 424 bool mode_selected = false;
391 bool top_offset = false; 425 uint64_t phys_offset = 0;
392 unsigned int mode; 426 unsigned int mode, host_ipa_limit;
393 int opt, i; 427 int opt, i;
394 428
395 while ((opt = getopt(argc, argv, "hi:I:o:tm:")) != -1) { 429#ifdef USE_CLEAR_DIRTY_LOG
430 if (!kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT)) {
431 fprintf(stderr, "KVM_CLEAR_DIRTY_LOG not available, skipping tests\n");
432 exit(KSFT_SKIP);
433 }
434#endif
435
436#ifdef __x86_64__
437 vm_guest_mode_params_init(VM_MODE_P52V48_4K, true, true);
438#endif
439#ifdef __aarch64__
440 vm_guest_mode_params_init(VM_MODE_P40V48_4K, true, true);
441 vm_guest_mode_params_init(VM_MODE_P40V48_64K, true, true);
442
443 host_ipa_limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE);
444 if (host_ipa_limit >= 52)
445 vm_guest_mode_params_init(VM_MODE_P52V48_64K, true, true);
446 if (host_ipa_limit >= 48) {
447 vm_guest_mode_params_init(VM_MODE_P48V48_4K, true, true);
448 vm_guest_mode_params_init(VM_MODE_P48V48_64K, true, true);
449 }
450#endif
451
452 while ((opt = getopt(argc, argv, "hi:I:p:m:")) != -1) {
396 switch (opt) { 453 switch (opt) {
397 case 'i': 454 case 'i':
398 iterations = strtol(optarg, NULL, 10); 455 iterations = strtol(optarg, NULL, 10);
@@ -400,22 +457,19 @@ int main(int argc, char *argv[])
400 case 'I': 457 case 'I':
401 interval = strtol(optarg, NULL, 10); 458 interval = strtol(optarg, NULL, 10);
402 break; 459 break;
403 case 'o': 460 case 'p':
404 guest_test_mem = strtoull(optarg, NULL, 0); 461 phys_offset = strtoull(optarg, NULL, 0);
405 break;
406 case 't':
407 top_offset = true;
408 break; 462 break;
409 case 'm': 463 case 'm':
410 if (!mode_selected) { 464 if (!mode_selected) {
411 for (i = 0; i < NUM_VM_MODES; ++i) 465 for (i = 0; i < NUM_VM_MODES; ++i)
412 vm_guest_modes[i].enabled = 0; 466 vm_guest_mode_params[i].enabled = false;
413 mode_selected = true; 467 mode_selected = true;
414 } 468 }
415 mode = strtoul(optarg, NULL, 10); 469 mode = strtoul(optarg, NULL, 10);
416 TEST_ASSERT(mode < NUM_VM_MODES, 470 TEST_ASSERT(mode < NUM_VM_MODES,
417 "Guest mode ID %d too big", mode); 471 "Guest mode ID %d too big", mode);
418 vm_guest_modes[mode].enabled = 1; 472 vm_guest_mode_params[mode].enabled = true;
419 break; 473 break;
420 case 'h': 474 case 'h':
421 default: 475 default:
@@ -426,8 +480,6 @@ int main(int argc, char *argv[])
426 480
427 TEST_ASSERT(iterations > 2, "Iterations must be greater than two"); 481 TEST_ASSERT(iterations > 2, "Iterations must be greater than two");
428 TEST_ASSERT(interval > 0, "Interval must be greater than zero"); 482 TEST_ASSERT(interval > 0, "Interval must be greater than zero");
429 TEST_ASSERT(!top_offset || guest_test_mem == DEFAULT_GUEST_TEST_MEM,
430 "Cannot use both -o [offset] and -t at the same time");
431 483
432 DEBUG("Test iterations: %"PRIu64", interval: %"PRIu64" (ms)\n", 484 DEBUG("Test iterations: %"PRIu64", interval: %"PRIu64" (ms)\n",
433 iterations, interval); 485 iterations, interval);
@@ -435,13 +487,12 @@ int main(int argc, char *argv[])
435 srandom(time(0)); 487 srandom(time(0));
436 488
437 for (i = 0; i < NUM_VM_MODES; ++i) { 489 for (i = 0; i < NUM_VM_MODES; ++i) {
438 if (!vm_guest_modes[i].enabled) 490 if (!vm_guest_mode_params[i].enabled)
439 continue; 491 continue;
440 TEST_ASSERT(vm_guest_modes[i].supported, 492 TEST_ASSERT(vm_guest_mode_params[i].supported,
441 "Guest mode ID %d (%s) not supported.", 493 "Guest mode ID %d (%s) not supported.",
442 vm_guest_modes[i].mode, 494 i, vm_guest_mode_string(i));
443 vm_guest_mode_string(vm_guest_modes[i].mode)); 495 run_test(i, iterations, interval, phys_offset);
444 run_test(vm_guest_modes[i].mode, iterations, interval, top_offset);
445 } 496 }
446 497
447 return 0; 498 return 0;
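The placement math introduced above is small but easy to get wrong; here is a stand-alone restatement of it. Parameter names mirror the test, the helper itself is illustrative.

#include <stdint.h>

/* Place the dirty-logging slot at the top of the guest physical address
 * space, aligned down to the host page size, exactly as run_test() now does
 * when no -p offset is given.
 */
static uint64_t pick_test_phys_base(unsigned int guest_pa_bits,
				    unsigned int guest_page_shift,
				    uint64_t guest_num_pages,
				    uint64_t host_page_size)
{
	uint64_t guest_page_size = UINT64_C(1) << guest_page_shift;
	uint64_t max_gfn = (UINT64_C(1) << (guest_pa_bits - guest_page_shift)) - 1;
	uint64_t base = (max_gfn - guest_num_pages) * guest_page_size;

	return base & ~(host_page_size - 1);
}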
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index a4e59e3b4826..a84785b02557 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -36,6 +36,8 @@ typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */
36enum vm_guest_mode { 36enum vm_guest_mode {
37 VM_MODE_P52V48_4K, 37 VM_MODE_P52V48_4K,
38 VM_MODE_P52V48_64K, 38 VM_MODE_P52V48_64K,
39 VM_MODE_P48V48_4K,
40 VM_MODE_P48V48_64K,
39 VM_MODE_P40V48_4K, 41 VM_MODE_P40V48_4K,
40 VM_MODE_P40V48_64K, 42 VM_MODE_P40V48_64K,
41 NUM_VM_MODES, 43 NUM_VM_MODES,
@@ -54,10 +56,14 @@ int kvm_check_cap(long cap);
54int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap); 56int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap);
55 57
56struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm); 58struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm);
59struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages,
60 int perm, unsigned long type);
57void kvm_vm_free(struct kvm_vm *vmp); 61void kvm_vm_free(struct kvm_vm *vmp);
58void kvm_vm_restart(struct kvm_vm *vmp, int perm); 62void kvm_vm_restart(struct kvm_vm *vmp, int perm);
59void kvm_vm_release(struct kvm_vm *vmp); 63void kvm_vm_release(struct kvm_vm *vmp);
60void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log); 64void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log);
65void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log,
66 uint64_t first_page, uint32_t num_pages);
61 67
62int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva, 68int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva,
63 size_t len); 69 size_t len);
@@ -78,6 +84,8 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
78 84
79void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl, 85void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl,
80 void *arg); 86 void *arg);
87int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl,
88 void *arg);
81void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); 89void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg);
82void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); 90void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags);
83void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot, 91void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot,
diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c
index b6022e2f116e..e8c42506a09d 100644
--- a/tools/testing/selftests/kvm/lib/aarch64/processor.c
+++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c
@@ -268,13 +268,20 @@ void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot)
268 268
269 switch (vm->mode) { 269 switch (vm->mode) {
270 case VM_MODE_P52V48_4K: 270 case VM_MODE_P52V48_4K:
271 tcr_el1 |= 0ul << 14; /* TG0 = 4KB */ 271 TEST_ASSERT(false, "AArch64 does not support 4K sized pages "
272 tcr_el1 |= 6ul << 32; /* IPS = 52 bits */ 272 "with 52-bit physical address ranges");
273 break;
274 case VM_MODE_P52V48_64K: 273 case VM_MODE_P52V48_64K:
275 tcr_el1 |= 1ul << 14; /* TG0 = 64KB */ 274 tcr_el1 |= 1ul << 14; /* TG0 = 64KB */
276 tcr_el1 |= 6ul << 32; /* IPS = 52 bits */ 275 tcr_el1 |= 6ul << 32; /* IPS = 52 bits */
277 break; 276 break;
277 case VM_MODE_P48V48_4K:
278 tcr_el1 |= 0ul << 14; /* TG0 = 4KB */
279 tcr_el1 |= 5ul << 32; /* IPS = 48 bits */
280 break;
281 case VM_MODE_P48V48_64K:
282 tcr_el1 |= 1ul << 14; /* TG0 = 64KB */
283 tcr_el1 |= 5ul << 32; /* IPS = 48 bits */
284 break;
278 case VM_MODE_P40V48_4K: 285 case VM_MODE_P40V48_4K:
279 tcr_el1 |= 0ul << 14; /* TG0 = 4KB */ 286 tcr_el1 |= 0ul << 14; /* TG0 = 4KB */
280 tcr_el1 |= 2ul << 32; /* IPS = 40 bits */ 287 tcr_el1 |= 2ul << 32; /* IPS = 40 bits */
@@ -305,7 +312,6 @@ void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent)
305 get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pstate), &pstate); 312 get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pstate), &pstate);
306 get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), &pc); 313 get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), &pc);
307 314
308 fprintf(stream, "%*spstate: 0x%.16llx pc: 0x%.16llx\n", 315 fprintf(stream, "%*spstate: 0x%.16llx pc: 0x%.16llx\n",
309 indent, "", pstate, pc); 316 indent, "", pstate, pc);
310
311} 317}
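For cross-checking the new cases above, these are the TCR_EL1 encodings being programmed (per the Arm architecture reference; shown here only as a comment, no new behaviour):

/*
 * TCR_EL1.TG0 (bits 15:14) - stage-1 granule size for TTBR0:
 *   0b00 = 4KB, 0b01 = 64KB, 0b10 = 16KB
 * TCR_EL1.IPS (bits 34:32) - intermediate physical address size:
 *   0b010 = 40 bits, 0b101 = 48 bits, 0b110 = 52 bits
 */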
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 1b41e71283d5..23022e9d32eb 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -85,13 +85,13 @@ int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap)
85 return ret; 85 return ret;
86} 86}
87 87
88static void vm_open(struct kvm_vm *vm, int perm) 88static void vm_open(struct kvm_vm *vm, int perm, unsigned long type)
89{ 89{
90 vm->kvm_fd = open(KVM_DEV_PATH, perm); 90 vm->kvm_fd = open(KVM_DEV_PATH, perm);
91 if (vm->kvm_fd < 0) 91 if (vm->kvm_fd < 0)
92 exit(KSFT_SKIP); 92 exit(KSFT_SKIP);
93 93
94 vm->fd = ioctl(vm->kvm_fd, KVM_CREATE_VM, NULL); 94 vm->fd = ioctl(vm->kvm_fd, KVM_CREATE_VM, type);
95 TEST_ASSERT(vm->fd >= 0, "KVM_CREATE_VM ioctl failed, " 95 TEST_ASSERT(vm->fd >= 0, "KVM_CREATE_VM ioctl failed, "
96 "rc: %i errno: %i", vm->fd, errno); 96 "rc: %i errno: %i", vm->fd, errno);
97} 97}
@@ -99,9 +99,13 @@ static void vm_open(struct kvm_vm *vm, int perm)
99const char * const vm_guest_mode_string[] = { 99const char * const vm_guest_mode_string[] = {
100 "PA-bits:52, VA-bits:48, 4K pages", 100 "PA-bits:52, VA-bits:48, 4K pages",
101 "PA-bits:52, VA-bits:48, 64K pages", 101 "PA-bits:52, VA-bits:48, 64K pages",
102 "PA-bits:48, VA-bits:48, 4K pages",
103 "PA-bits:48, VA-bits:48, 64K pages",
102 "PA-bits:40, VA-bits:48, 4K pages", 104 "PA-bits:40, VA-bits:48, 4K pages",
103 "PA-bits:40, VA-bits:48, 64K pages", 105 "PA-bits:40, VA-bits:48, 64K pages",
104}; 106};
107_Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES,
108 "Missing new mode strings?");
105 109
106/* 110/*
107 * VM Create 111 * VM Create
@@ -122,7 +126,8 @@ const char * const vm_guest_mode_string[] = {
122 * descriptor to control the created VM is created with the permissions 126 * descriptor to control the created VM is created with the permissions
123 * given by perm (e.g. O_RDWR). 127 * given by perm (e.g. O_RDWR).
124 */ 128 */
125struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) 129struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages,
130 int perm, unsigned long type)
126{ 131{
127 struct kvm_vm *vm; 132 struct kvm_vm *vm;
128 int kvm_fd; 133 int kvm_fd;
@@ -131,22 +136,38 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
131 TEST_ASSERT(vm != NULL, "Insufficient Memory"); 136 TEST_ASSERT(vm != NULL, "Insufficient Memory");
132 137
133 vm->mode = mode; 138 vm->mode = mode;
134 vm_open(vm, perm); 139 vm->type = type;
140 vm_open(vm, perm, type);
135 141
136 /* Setup mode specific traits. */ 142 /* Setup mode specific traits. */
137 switch (vm->mode) { 143 switch (vm->mode) {
138 case VM_MODE_P52V48_4K: 144 case VM_MODE_P52V48_4K:
139 vm->pgtable_levels = 4; 145 vm->pgtable_levels = 4;
146 vm->pa_bits = 52;
147 vm->va_bits = 48;
140 vm->page_size = 0x1000; 148 vm->page_size = 0x1000;
141 vm->page_shift = 12; 149 vm->page_shift = 12;
142 vm->va_bits = 48;
143 break; 150 break;
144 case VM_MODE_P52V48_64K: 151 case VM_MODE_P52V48_64K:
145 vm->pgtable_levels = 3; 152 vm->pgtable_levels = 3;
146 vm->pa_bits = 52; 153 vm->pa_bits = 52;
154 vm->va_bits = 48;
147 vm->page_size = 0x10000; 155 vm->page_size = 0x10000;
148 vm->page_shift = 16; 156 vm->page_shift = 16;
157 break;
158 case VM_MODE_P48V48_4K:
159 vm->pgtable_levels = 4;
160 vm->pa_bits = 48;
161 vm->va_bits = 48;
162 vm->page_size = 0x1000;
163 vm->page_shift = 12;
164 break;
165 case VM_MODE_P48V48_64K:
166 vm->pgtable_levels = 3;
167 vm->pa_bits = 48;
149 vm->va_bits = 48; 168 vm->va_bits = 48;
169 vm->page_size = 0x10000;
170 vm->page_shift = 16;
150 break; 171 break;
151 case VM_MODE_P40V48_4K: 172 case VM_MODE_P40V48_4K:
152 vm->pgtable_levels = 4; 173 vm->pgtable_levels = 4;
@@ -186,6 +207,11 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
186 return vm; 207 return vm;
187} 208}
188 209
210struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
211{
212 return _vm_create(mode, phy_pages, perm, 0);
213}
214
189/* 215/*
190 * VM Restart 216 * VM Restart
191 * 217 *
@@ -203,7 +229,7 @@ void kvm_vm_restart(struct kvm_vm *vmp, int perm)
203{ 229{
204 struct userspace_mem_region *region; 230 struct userspace_mem_region *region;
205 231
206 vm_open(vmp, perm); 232 vm_open(vmp, perm, vmp->type);
207 if (vmp->has_irqchip) 233 if (vmp->has_irqchip)
208 vm_create_irqchip(vmp); 234 vm_create_irqchip(vmp);
209 235
@@ -231,6 +257,19 @@ void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log)
231 strerror(-ret)); 257 strerror(-ret));
232} 258}
233 259
260void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log,
261 uint64_t first_page, uint32_t num_pages)
262{
263 struct kvm_clear_dirty_log args = { .dirty_bitmap = log, .slot = slot,
264 .first_page = first_page,
265 .num_pages = num_pages };
266 int ret;
267
268 ret = ioctl(vm->fd, KVM_CLEAR_DIRTY_LOG, &args);
269 TEST_ASSERT(ret == 0, "%s: KVM_CLEAR_DIRTY_LOG failed: %s",
270 __func__, strerror(-ret));
271}
272
234/* 273/*
235 * Userspace Memory Region Find 274 * Userspace Memory Region Find
236 * 275 *
@@ -1270,14 +1309,24 @@ int _vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs)
1270void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, 1309void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid,
1271 unsigned long cmd, void *arg) 1310 unsigned long cmd, void *arg)
1272{ 1311{
1312 int ret;
1313
1314 ret = _vcpu_ioctl(vm, vcpuid, cmd, arg);
1315 TEST_ASSERT(ret == 0, "vcpu ioctl %lu failed, rc: %i errno: %i (%s)",
1316 cmd, ret, errno, strerror(errno));
1317}
1318
1319int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid,
1320 unsigned long cmd, void *arg)
1321{
1273 struct vcpu *vcpu = vcpu_find(vm, vcpuid); 1322 struct vcpu *vcpu = vcpu_find(vm, vcpuid);
1274 int ret; 1323 int ret;
1275 1324
1276 TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); 1325 TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
1277 1326
1278 ret = ioctl(vcpu->fd, cmd, arg); 1327 ret = ioctl(vcpu->fd, cmd, arg);
1279 TEST_ASSERT(ret == 0, "vcpu ioctl %lu failed, rc: %i errno: %i (%s)", 1328
1280 cmd, ret, errno, strerror(errno)); 1329 return ret;
1281} 1330}
1282 1331
1283/* 1332/*
@@ -1422,7 +1471,7 @@ const char *exit_reason_str(unsigned int exit_reason)
1422 * 1471 *
1423 * Within the VM specified by vm, locates a range of available physical 1472 * Within the VM specified by vm, locates a range of available physical
1424 * pages at or above paddr_min. If found, the pages are marked as in use 1473 * pages at or above paddr_min. If found, the pages are marked as in use
1425 * and thier base address is returned. A TEST_ASSERT failure occurs if 1474 * and their base address is returned. A TEST_ASSERT failure occurs if
1426 * not enough pages are available at or above paddr_min. 1475 * not enough pages are available at or above paddr_min.
1427 */ 1476 */
1428vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, 1477vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num,
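A short sketch of how the new type-aware constructor above is meant to be used from a test, mirroring what dirty_log_test.c does for non-default aarch64 IPA sizes. The wrapper name is illustrative.

#include <fcntl.h>
#include "kvm_util.h"

static struct kvm_vm *create_vm_for_pa_bits(enum vm_guest_mode mode,
					    uint64_t phy_pages,
					    unsigned int pa_bits)
{
	unsigned long type = 0;

#ifdef __aarch64__
	/* 40-bit IPA is the default; anything else needs an explicit type. */
	if (pa_bits != 40)
		type = KVM_VM_TYPE_ARM_IPA_SIZE(pa_bits);
#endif
	return _vm_create(mode, phy_pages, O_RDWR, type);
}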
diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h
index 52701db0f253..4595e42c6e29 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h
+++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h
@@ -44,6 +44,7 @@ struct vcpu {
44 44
45struct kvm_vm { 45struct kvm_vm {
46 int mode; 46 int mode;
47 unsigned long type;
47 int kvm_fd; 48 int kvm_fd;
48 int fd; 49 int fd;
49 unsigned int pgtable_levels; 50 unsigned int pgtable_levels;
diff --git a/tools/testing/selftests/kvm/lib/ucall.c b/tools/testing/selftests/kvm/lib/ucall.c
index 4777f9bb5194..a2ab38be2f47 100644
--- a/tools/testing/selftests/kvm/lib/ucall.c
+++ b/tools/testing/selftests/kvm/lib/ucall.c
@@ -34,7 +34,8 @@ void ucall_init(struct kvm_vm *vm, ucall_type_t type, void *arg)
34 return; 34 return;
35 35
36 if (type == UCALL_MMIO) { 36 if (type == UCALL_MMIO) {
37 vm_paddr_t gpa, start, end, step; 37 vm_paddr_t gpa, start, end, step, offset;
38 unsigned bits;
38 bool ret; 39 bool ret;
39 40
40 if (arg) { 41 if (arg) {
@@ -45,25 +46,30 @@ void ucall_init(struct kvm_vm *vm, ucall_type_t type, void *arg)
45 } 46 }
46 47
47 /* 48 /*
48 * Find an address within the allowed virtual address space, 49 * Find an address within the allowed physical and virtual address
49 * that does _not_ have a KVM memory region associated with it. 50 * spaces, that does _not_ have a KVM memory region associated with
50 * Identity mapping an address like this allows the guest to 51 * it. Identity mapping an address like this allows the guest to
51 * access it, but as KVM doesn't know what to do with it, it 52 * access it, but as KVM doesn't know what to do with it, it
52 * will assume it's something userspace handles and exit with 53 * will assume it's something userspace handles and exit with
53 * KVM_EXIT_MMIO. Well, at least that's how it works for AArch64. 54 * KVM_EXIT_MMIO. Well, at least that's how it works for AArch64.
54 * Here we start with a guess that the addresses around two 55 * Here we start with a guess that the addresses around 5/8th
55 * thirds of the VA space are unmapped and then work both down 56 * of the allowed space are unmapped and then work both down and
56 * and up from there in 1/6 VA space sized steps. 57 * up from there in 1/16th allowed space sized steps.
58 *
59 * Note, we need to use VA-bits - 1 when calculating the allowed
60 * virtual address space for an identity mapping because the upper
61 * half of the virtual address space is the two's complement of the
62 * lower and won't match physical addresses.
57 */ 63 */
58 start = 1ul << (vm->va_bits * 2 / 3); 64 bits = vm->va_bits - 1;
59 end = 1ul << vm->va_bits; 65 bits = vm->pa_bits < bits ? vm->pa_bits : bits;
60 step = 1ul << (vm->va_bits / 6); 66 end = 1ul << bits;
61 for (gpa = start; gpa >= 0; gpa -= step) { 67 start = end * 5 / 8;
62 if (ucall_mmio_init(vm, gpa & ~(vm->page_size - 1))) 68 step = end / 16;
69 for (offset = 0; offset < end - start; offset += step) {
70 if (ucall_mmio_init(vm, start - offset))
63 return; 71 return;
64 } 72 if (ucall_mmio_init(vm, start + offset))
65 for (gpa = start + step; gpa < end; gpa += step) {
66 if (ucall_mmio_init(vm, gpa & ~(vm->page_size - 1)))
67 return; 73 return;
68 } 74 }
69 TEST_ASSERT(false, "Can't find a ucall mmio address"); 75 TEST_ASSERT(false, "Can't find a ucall mmio address");
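A quick numeric check of the reworked search window, taking VM_MODE_P40V48_4K as an example (pa_bits = 40, va_bits = 48):

	bits  = min(pa_bits, va_bits - 1) = min(40, 47) = 40
	end   = 1 << 40                   = 0x100_0000_0000
	start = end * 5 / 8               = 0xa0_0000_0000
	step  = end / 16                  = 0x10_0000_0000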
diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c
index 92c2cfd1b182..ea3c73e8f4f6 100644
--- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c
+++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c
@@ -113,8 +113,8 @@ int main(int argc, char *argv[])
113 for (stage = 1;; stage++) { 113 for (stage = 1;; stage++) {
114 _vcpu_run(vm, VCPU_ID); 114 _vcpu_run(vm, VCPU_ID);
115 TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, 115 TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
116 "Unexpected exit reason: %u (%s),\n", 116 "Stage %d: unexpected exit reason: %u (%s),\n",
117 run->exit_reason, 117 stage, run->exit_reason,
118 exit_reason_str(run->exit_reason)); 118 exit_reason_str(run->exit_reason));
119 119
120 memset(&regs1, 0, sizeof(regs1)); 120 memset(&regs1, 0, sizeof(regs1));
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
new file mode 100644
index 000000000000..264425f75806
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
@@ -0,0 +1,157 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Test for x86 KVM_CAP_HYPERV_CPUID
4 *
5 * Copyright (C) 2018, Red Hat, Inc.
6 *
7 * This work is licensed under the terms of the GNU GPL, version 2.
8 *
9 */
10
11#define _GNU_SOURCE /* for program_invocation_short_name */
12#include <fcntl.h>
13#include <stdio.h>
14#include <stdlib.h>
15#include <string.h>
16#include <sys/ioctl.h>
17
18#include "test_util.h"
19#include "kvm_util.h"
20#include "processor.h"
21
22#define VCPU_ID 0
23
24static void guest_code(void)
25{
26}
27
28static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries,
29 int evmcs_enabled)
30{
31 int i;
32
33 if (!evmcs_enabled)
34 TEST_ASSERT(hv_cpuid_entries->nent == 6,
35 "KVM_GET_SUPPORTED_HV_CPUID should return 6 entries"
36 " when Enlightened VMCS is disabled (returned %d)",
37 hv_cpuid_entries->nent);
38 else
39 TEST_ASSERT(hv_cpuid_entries->nent == 7,
40 "KVM_GET_SUPPORTED_HV_CPUID should return 7 entries"
41 " when Enlightened VMCS is enabled (returned %d)",
42 hv_cpuid_entries->nent);
43
44 for (i = 0; i < hv_cpuid_entries->nent; i++) {
45 struct kvm_cpuid_entry2 *entry = &hv_cpuid_entries->entries[i];
46
47 TEST_ASSERT((entry->function >= 0x40000000) &&
48 (entry->function <= 0x4000000A),
49 "function %lx is our of supported range",
50 entry->function);
51
52 TEST_ASSERT(entry->index == 0,
53 ".index field should be zero");
54
55 TEST_ASSERT(entry->index == 0,
56 ".index field should be zero");
57
58 TEST_ASSERT(entry->flags == 0,
59 ".flags field should be zero");
60
61 TEST_ASSERT(!entry->padding[0] && !entry->padding[1] &&
62 !entry->padding[2],
63 ".padding fields should be zero");
64
65 /*
66 * If needed for debug:
67 * fprintf(stdout,
68 * "CPUID%lx EAX=0x%lx EBX=0x%lx ECX=0x%lx EDX=0x%lx\n",
69 * entry->function, entry->eax, entry->ebx, entry->ecx,
70 * entry->edx);
71 */
72 }
73
74}
75
76void test_hv_cpuid_e2big(struct kvm_vm *vm)
77{
78 static struct kvm_cpuid2 cpuid = {.nent = 0};
79 int ret;
80
81 ret = _vcpu_ioctl(vm, VCPU_ID, KVM_GET_SUPPORTED_HV_CPUID, &cpuid);
82
83 TEST_ASSERT(ret == -1 && errno == E2BIG,
84 "KVM_GET_SUPPORTED_HV_CPUID didn't fail with -E2BIG when"
85 " it should have: %d %d", ret, errno);
86}
87
88
89struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(struct kvm_vm *vm)
90{
91 int nent = 20; /* should be enough */
92 static struct kvm_cpuid2 *cpuid;
93 int ret;
94
95 cpuid = malloc(sizeof(*cpuid) + nent * sizeof(struct kvm_cpuid_entry2));
96
97 if (!cpuid) {
98 perror("malloc");
99 abort();
100 }
101
102 cpuid->nent = nent;
103
104 vcpu_ioctl(vm, VCPU_ID, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
105
106 return cpuid;
107}
108
109
110int main(int argc, char *argv[])
111{
112 struct kvm_vm *vm;
113 int rv;
114 uint16_t evmcs_ver;
115 struct kvm_cpuid2 *hv_cpuid_entries;
116 struct kvm_enable_cap enable_evmcs_cap = {
117 .cap = KVM_CAP_HYPERV_ENLIGHTENED_VMCS,
118 .args[0] = (unsigned long)&evmcs_ver
119 };
120
121 /* Tell stdout not to buffer its content */
122 setbuf(stdout, NULL);
123
124 rv = kvm_check_cap(KVM_CAP_HYPERV_CPUID);
125 if (!rv) {
126 fprintf(stderr,
127 "KVM_CAP_HYPERV_CPUID not supported, skip test\n");
128 exit(KSFT_SKIP);
129 }
130
131 /* Create VM */
132 vm = vm_create_default(VCPU_ID, 0, guest_code);
133
134 test_hv_cpuid_e2big(vm);
135
136 hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm);
137 if (!hv_cpuid_entries)
138 return 1;
139
140 test_hv_cpuid(hv_cpuid_entries, 0);
141
142 free(hv_cpuid_entries);
143
144 vcpu_ioctl(vm, VCPU_ID, KVM_ENABLE_CAP, &enable_evmcs_cap);
145
146 hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm);
147 if (!hv_cpuid_entries)
148 return 1;
149
150 test_hv_cpuid(hv_cpuid_entries, 1);
151
152 free(hv_cpuid_entries);
153
154 kvm_vm_free(vm);
155
156 return 0;
157}
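Outside the selftest framework, the same vCPU ioctl can be driven directly; a hedged sketch that grows the buffer when KVM reports E2BIG (vcpu_fd and the starting entry count are placeholders):

#include <errno.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static struct kvm_cpuid2 *get_hv_cpuid(int vcpu_fd)
{
	int nent = 8;

	for (;;) {
		struct kvm_cpuid2 *cpuid;

		cpuid = calloc(1, sizeof(*cpuid) +
			       nent * sizeof(struct kvm_cpuid_entry2));
		if (!cpuid)
			return NULL;
		cpuid->nent = nent;

		if (!ioctl(vcpu_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid))
			return cpuid;	/* entries filled in by KVM */

		free(cpuid);
		if (errno != E2BIG)
			return NULL;	/* unexpected failure */
		nent *= 2;		/* buffer too small, retry larger */
	}
}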
diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c
index 03da41f0f736..4b3f556265f1 100644
--- a/tools/testing/selftests/kvm/x86_64/state_test.c
+++ b/tools/testing/selftests/kvm/x86_64/state_test.c
@@ -152,8 +152,8 @@ int main(int argc, char *argv[])
152 for (stage = 1;; stage++) { 152 for (stage = 1;; stage++) {
153 _vcpu_run(vm, VCPU_ID); 153 _vcpu_run(vm, VCPU_ID);
154 TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, 154 TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
155 "Unexpected exit reason: %u (%s),\n", 155 "Stage %d: unexpected exit reason: %u (%s),\n",
156 run->exit_reason, 156 stage, run->exit_reason,
157 exit_reason_str(run->exit_reason)); 157 exit_reason_str(run->exit_reason));
158 158
159 memset(&regs1, 0, sizeof(regs1)); 159 memset(&regs1, 0, sizeof(regs1));
diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk
index 0a8e75886224..8b0f16409ed7 100644
--- a/tools/testing/selftests/lib.mk
+++ b/tools/testing/selftests/lib.mk
@@ -16,18 +16,18 @@ TEST_GEN_PROGS := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS))
16TEST_GEN_PROGS_EXTENDED := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS_EXTENDED)) 16TEST_GEN_PROGS_EXTENDED := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS_EXTENDED))
17TEST_GEN_FILES := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_FILES)) 17TEST_GEN_FILES := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_FILES))
18 18
19ifdef KSFT_KHDR_INSTALL
19top_srcdir ?= ../../../.. 20top_srcdir ?= ../../../..
20include $(top_srcdir)/scripts/subarch.include 21include $(top_srcdir)/scripts/subarch.include
21ARCH ?= $(SUBARCH) 22ARCH ?= $(SUBARCH)
22 23
23all: $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES)
24
25.PHONY: khdr 24.PHONY: khdr
26khdr: 25khdr:
27 make ARCH=$(ARCH) -C $(top_srcdir) headers_install 26 make ARCH=$(ARCH) -C $(top_srcdir) headers_install
28 27
29ifdef KSFT_KHDR_INSTALL 28all: khdr $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES)
30$(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES):| khdr 29else
30all: $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES)
31endif 31endif
32 32
33.ONESHELL: 33.ONESHELL:
diff --git a/tools/testing/selftests/networking/timestamping/Makefile b/tools/testing/selftests/networking/timestamping/Makefile
index 14cfcf006936..c46c0eefab9e 100644
--- a/tools/testing/selftests/networking/timestamping/Makefile
+++ b/tools/testing/selftests/networking/timestamping/Makefile
@@ -6,6 +6,7 @@ TEST_PROGS := hwtstamp_config rxtimestamp timestamping txtimestamp
6all: $(TEST_PROGS) 6all: $(TEST_PROGS)
7 7
8top_srcdir = ../../../../.. 8top_srcdir = ../../../../..
9KSFT_KHDR_INSTALL := 1
9include ../../lib.mk 10include ../../lib.mk
10 11
11clean: 12clean:
diff --git a/tools/testing/selftests/tc-testing/bpf/Makefile b/tools/testing/selftests/tc-testing/bpf/Makefile
index dc92eb271d9a..be5a5e542804 100644
--- a/tools/testing/selftests/tc-testing/bpf/Makefile
+++ b/tools/testing/selftests/tc-testing/bpf/Makefile
@@ -4,6 +4,7 @@ APIDIR := ../../../../include/uapi
4TEST_GEN_FILES = action.o 4TEST_GEN_FILES = action.o
5 5
6top_srcdir = ../../../../.. 6top_srcdir = ../../../../..
7KSFT_KHDR_INSTALL := 1
7include ../../lib.mk 8include ../../lib.mk
8 9
9CLANG ?= clang 10CLANG ?= clang
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 6e67e726e5a5..e13eb6cc8901 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -25,6 +25,7 @@ TEST_GEN_FILES += virtual_address_range
25 25
26TEST_PROGS := run_vmtests 26TEST_PROGS := run_vmtests
27 27
28KSFT_KHDR_INSTALL := 1
28include ../lib.mk 29include ../lib.mk
29 30
30$(OUTPUT)/userfaultfd: LDLIBS += -lpthread 31$(OUTPUT)/userfaultfd: LDLIBS += -lpthread
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 17cecc96f735..b07ac4614e1c 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -70,11 +70,9 @@ static void soft_timer_start(struct hrtimer *hrt, u64 ns)
70 HRTIMER_MODE_ABS); 70 HRTIMER_MODE_ABS);
71} 71}
72 72
73static void soft_timer_cancel(struct hrtimer *hrt, struct work_struct *work) 73static void soft_timer_cancel(struct hrtimer *hrt)
74{ 74{
75 hrtimer_cancel(hrt); 75 hrtimer_cancel(hrt);
76 if (work)
77 cancel_work_sync(work);
78} 76}
79 77
80static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) 78static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
@@ -102,23 +100,6 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
102 return IRQ_HANDLED; 100 return IRQ_HANDLED;
103} 101}
104 102
105/*
106 * Work function for handling the backup timer that we schedule when a vcpu is
107 * no longer running, but had a timer programmed to fire in the future.
108 */
109static void kvm_timer_inject_irq_work(struct work_struct *work)
110{
111 struct kvm_vcpu *vcpu;
112
113 vcpu = container_of(work, struct kvm_vcpu, arch.timer_cpu.expired);
114
115 /*
116 * If the vcpu is blocked we want to wake it up so that it will see
117 * the timer has expired when entering the guest.
118 */
119 kvm_vcpu_wake_up(vcpu);
120}
121
122static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx) 103static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx)
123{ 104{
124 u64 cval, now; 105 u64 cval, now;
@@ -188,7 +169,7 @@ static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt)
188 return HRTIMER_RESTART; 169 return HRTIMER_RESTART;
189 } 170 }
190 171
191 schedule_work(&timer->expired); 172 kvm_vcpu_wake_up(vcpu);
192 return HRTIMER_NORESTART; 173 return HRTIMER_NORESTART;
193} 174}
194 175
@@ -300,7 +281,7 @@ static void phys_timer_emulate(struct kvm_vcpu *vcpu)
300 * then we also don't need a soft timer. 281 * then we also don't need a soft timer.
301 */ 282 */
302 if (kvm_timer_should_fire(ptimer) || !kvm_timer_irq_can_fire(ptimer)) { 283 if (kvm_timer_should_fire(ptimer) || !kvm_timer_irq_can_fire(ptimer)) {
303 soft_timer_cancel(&timer->phys_timer, NULL); 284 soft_timer_cancel(&timer->phys_timer);
304 return; 285 return;
305 } 286 }
306 287
@@ -426,7 +407,7 @@ void kvm_timer_unschedule(struct kvm_vcpu *vcpu)
426 407
427 vtimer_restore_state(vcpu); 408 vtimer_restore_state(vcpu);
428 409
429 soft_timer_cancel(&timer->bg_timer, &timer->expired); 410 soft_timer_cancel(&timer->bg_timer);
430} 411}
431 412
432static void set_cntvoff(u64 cntvoff) 413static void set_cntvoff(u64 cntvoff)
@@ -544,7 +525,7 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
544 * In any case, we re-schedule the hrtimer for the physical timer when 525 * In any case, we re-schedule the hrtimer for the physical timer when
545 * coming back to the VCPU thread in kvm_timer_vcpu_load(). 526 * coming back to the VCPU thread in kvm_timer_vcpu_load().
546 */ 527 */
547 soft_timer_cancel(&timer->phys_timer, NULL); 528 soft_timer_cancel(&timer->phys_timer);
548 529
549 /* 530 /*
550 * The kernel may decide to run userspace after calling vcpu_put, so 531 * The kernel may decide to run userspace after calling vcpu_put, so
@@ -637,7 +618,6 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
637 update_vtimer_cntvoff(vcpu, kvm_phys_timer_read()); 618 update_vtimer_cntvoff(vcpu, kvm_phys_timer_read());
638 vcpu_ptimer(vcpu)->cntvoff = 0; 619 vcpu_ptimer(vcpu)->cntvoff = 0;
639 620
640 INIT_WORK(&timer->expired, kvm_timer_inject_irq_work);
641 hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 621 hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
642 timer->bg_timer.function = kvm_bg_timer_expire; 622 timer->bg_timer.function = kvm_bg_timer_expire;
643 623
@@ -792,11 +772,8 @@ out_free_irq:
792void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) 772void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
793{ 773{
794 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 774 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
795 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
796 775
797 soft_timer_cancel(&timer->bg_timer, &timer->expired); 776 soft_timer_cancel(&timer->bg_timer);
798 soft_timer_cancel(&timer->phys_timer, NULL);
799 kvm_vgic_unmap_phys_irq(vcpu, vtimer->irq.irq);
800} 777}
801 778
802static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu) 779static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu)
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 36165748a315..9e350fd34504 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -66,7 +66,7 @@ static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_arm_running_vcpu);
66static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); 66static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
67static u32 kvm_next_vmid; 67static u32 kvm_next_vmid;
68static unsigned int kvm_vmid_bits __read_mostly; 68static unsigned int kvm_vmid_bits __read_mostly;
69static DEFINE_RWLOCK(kvm_vmid_lock); 69static DEFINE_SPINLOCK(kvm_vmid_lock);
70 70
71static bool vgic_present; 71static bool vgic_present;
72 72
@@ -484,7 +484,9 @@ void force_vm_exit(const cpumask_t *mask)
484 */ 484 */
485static bool need_new_vmid_gen(struct kvm *kvm) 485static bool need_new_vmid_gen(struct kvm *kvm)
486{ 486{
487 return unlikely(kvm->arch.vmid_gen != atomic64_read(&kvm_vmid_gen)); 487 u64 current_vmid_gen = atomic64_read(&kvm_vmid_gen);
488 smp_rmb(); /* Orders read of kvm_vmid_gen and kvm->arch.vmid */
489 return unlikely(READ_ONCE(kvm->arch.vmid_gen) != current_vmid_gen);
488} 490}
489 491
490/** 492/**
@@ -499,16 +501,11 @@ static void update_vttbr(struct kvm *kvm)
499{ 501{
500 phys_addr_t pgd_phys; 502 phys_addr_t pgd_phys;
501 u64 vmid, cnp = kvm_cpu_has_cnp() ? VTTBR_CNP_BIT : 0; 503 u64 vmid, cnp = kvm_cpu_has_cnp() ? VTTBR_CNP_BIT : 0;
502 bool new_gen;
503 504
504 read_lock(&kvm_vmid_lock); 505 if (!need_new_vmid_gen(kvm))
505 new_gen = need_new_vmid_gen(kvm);
506 read_unlock(&kvm_vmid_lock);
507
508 if (!new_gen)
509 return; 506 return;
510 507
511 write_lock(&kvm_vmid_lock); 508 spin_lock(&kvm_vmid_lock);
512 509
513 /* 510 /*
514 * We need to re-check the vmid_gen here to ensure that if another vcpu 511 * We need to re-check the vmid_gen here to ensure that if another vcpu
@@ -516,7 +513,7 @@ static void update_vttbr(struct kvm *kvm)
516 * use the same vmid. 513 * use the same vmid.
517 */ 514 */
518 if (!need_new_vmid_gen(kvm)) { 515 if (!need_new_vmid_gen(kvm)) {
519 write_unlock(&kvm_vmid_lock); 516 spin_unlock(&kvm_vmid_lock);
520 return; 517 return;
521 } 518 }
522 519
@@ -539,7 +536,6 @@ static void update_vttbr(struct kvm *kvm)
539 kvm_call_hyp(__kvm_flush_vm_context); 536 kvm_call_hyp(__kvm_flush_vm_context);
540 } 537 }
541 538
542 kvm->arch.vmid_gen = atomic64_read(&kvm_vmid_gen);
543 kvm->arch.vmid = kvm_next_vmid; 539 kvm->arch.vmid = kvm_next_vmid;
544 kvm_next_vmid++; 540 kvm_next_vmid++;
545 kvm_next_vmid &= (1 << kvm_vmid_bits) - 1; 541 kvm_next_vmid &= (1 << kvm_vmid_bits) - 1;
@@ -550,7 +546,10 @@ static void update_vttbr(struct kvm *kvm)
550 vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits); 546 vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits);
551 kvm->arch.vttbr = kvm_phys_to_vttbr(pgd_phys) | vmid | cnp; 547 kvm->arch.vttbr = kvm_phys_to_vttbr(pgd_phys) | vmid | cnp;
552 548
553 write_unlock(&kvm_vmid_lock); 549 smp_wmb();
550 WRITE_ONCE(kvm->arch.vmid_gen, atomic64_read(&kvm_vmid_gen));
551
552 spin_unlock(&kvm_vmid_lock);
554} 553}
555 554
556static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) 555static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
@@ -674,8 +673,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
674 ret = kvm_handle_mmio_return(vcpu, vcpu->run); 673 ret = kvm_handle_mmio_return(vcpu, vcpu->run);
675 if (ret) 674 if (ret)
676 return ret; 675 return ret;
677 if (kvm_arm_handle_step_debug(vcpu, vcpu->run))
678 return 0;
679 } 676 }
680 677
681 if (run->immediate_exit) 678 if (run->immediate_exit)
@@ -1205,14 +1202,30 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1205 */ 1202 */
1206int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) 1203int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
1207{ 1204{
1208 bool is_dirty = false; 1205 bool flush = false;
1206 int r;
1207
1208 mutex_lock(&kvm->slots_lock);
1209
1210 r = kvm_get_dirty_log_protect(kvm, log, &flush);
1211
1212 if (flush)
1213 kvm_flush_remote_tlbs(kvm);
1214
1215 mutex_unlock(&kvm->slots_lock);
1216 return r;
1217}
1218
1219int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
1220{
1221 bool flush = false;
1209 int r; 1222 int r;
1210 1223
1211 mutex_lock(&kvm->slots_lock); 1224 mutex_lock(&kvm->slots_lock);
1212 1225
1213 r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); 1226 r = kvm_clear_dirty_log_protect(kvm, log, &flush);
1214 1227
1215 if (is_dirty) 1228 if (flush)
1216 kvm_flush_remote_tlbs(kvm); 1229 kvm_flush_remote_tlbs(kvm);
1217 1230
1218 mutex_unlock(&kvm->slots_lock); 1231 mutex_unlock(&kvm->slots_lock);
diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c
index 616e5a433ab0..9652c453480f 100644
--- a/virt/kvm/arm/hyp/vgic-v3-sr.c
+++ b/virt/kvm/arm/hyp/vgic-v3-sr.c
@@ -1012,8 +1012,10 @@ int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
1012 1012
1013 esr = kvm_vcpu_get_hsr(vcpu); 1013 esr = kvm_vcpu_get_hsr(vcpu);
1014 if (vcpu_mode_is_32bit(vcpu)) { 1014 if (vcpu_mode_is_32bit(vcpu)) {
1015 if (!kvm_condition_valid(vcpu)) 1015 if (!kvm_condition_valid(vcpu)) {
1016 __kvm_skip_instr(vcpu);
1016 return 1; 1017 return 1;
1018 }
1017 1019
1018 sysreg = esr_cp15_to_sysreg(esr); 1020 sysreg = esr_cp15_to_sysreg(esr);
1019 } else { 1021 } else {
@@ -1123,6 +1125,8 @@ int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
1123 rt = kvm_vcpu_sys_get_rt(vcpu); 1125 rt = kvm_vcpu_sys_get_rt(vcpu);
1124 fn(vcpu, vmcr, rt); 1126 fn(vcpu, vmcr, rt);
1125 1127
1128 __kvm_skip_instr(vcpu);
1129
1126 return 1; 1130 return 1;
1127} 1131}
1128 1132
diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c
index dac7ceb1a677..08443a15e6be 100644
--- a/virt/kvm/arm/mmio.c
+++ b/virt/kvm/arm/mmio.c
@@ -117,6 +117,12 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
117 vcpu_set_reg(vcpu, vcpu->arch.mmio_decode.rt, data); 117 vcpu_set_reg(vcpu, vcpu->arch.mmio_decode.rt, data);
118 } 118 }
119 119
120 /*
121 * The MMIO instruction is emulated and should not be re-executed
122 * in the guest.
123 */
124 kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
125
120 return 0; 126 return 0;
121} 127}
122 128
@@ -144,11 +150,6 @@ static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len)
144 vcpu->arch.mmio_decode.sign_extend = sign_extend; 150 vcpu->arch.mmio_decode.sign_extend = sign_extend;
145 vcpu->arch.mmio_decode.rt = rt; 151 vcpu->arch.mmio_decode.rt = rt;
146 152
147 /*
148 * The MMIO instruction is emulated and should not be re-executed
149 * in the guest.
150 */
151 kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
152 return 0; 153 return 0;
153} 154}
154 155
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 5eca48bdb1a6..3053bf2584f8 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -115,6 +115,25 @@ static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
115 put_page(virt_to_page(pmd)); 115 put_page(virt_to_page(pmd));
116} 116}
117 117
118/**
119 * stage2_dissolve_pud() - clear and flush huge PUD entry
120 * @kvm: pointer to kvm structure.
121 * @addr: IPA
122 * @pudp: pud pointer for IPA
123 *
124 * Clears a PUD entry, flushes the address's stage-1 and stage-2 TLBs, and
125 * marks all pages in the range dirty.
126 */
127static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp)
128{
129 if (!stage2_pud_huge(kvm, *pudp))
130 return;
131
132 stage2_pud_clear(kvm, pudp);
133 kvm_tlb_flush_vmid_ipa(kvm, addr);
134 put_page(virt_to_page(pudp));
135}
136
118static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 137static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
119 int min, int max) 138 int min, int max)
120{ 139{
@@ -607,7 +626,7 @@ static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
607 addr = start; 626 addr = start;
608 do { 627 do {
609 pte = pte_offset_kernel(pmd, addr); 628 pte = pte_offset_kernel(pmd, addr);
610 kvm_set_pte(pte, pfn_pte(pfn, prot)); 629 kvm_set_pte(pte, kvm_pfn_pte(pfn, prot));
611 get_page(virt_to_page(pte)); 630 get_page(virt_to_page(pte));
612 pfn++; 631 pfn++;
613 } while (addr += PAGE_SIZE, addr != end); 632 } while (addr += PAGE_SIZE, addr != end);
@@ -1022,7 +1041,7 @@ static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache
1022 pmd_t *pmd; 1041 pmd_t *pmd;
1023 1042
1024 pud = stage2_get_pud(kvm, cache, addr); 1043 pud = stage2_get_pud(kvm, cache, addr);
1025 if (!pud) 1044 if (!pud || stage2_pud_huge(kvm, *pud))
1026 return NULL; 1045 return NULL;
1027 1046
1028 if (stage2_pud_none(kvm, *pud)) { 1047 if (stage2_pud_none(kvm, *pud)) {
@@ -1083,29 +1102,103 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
1083 return 0; 1102 return 0;
1084} 1103}
1085 1104
1086static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr) 1105static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
1106 phys_addr_t addr, const pud_t *new_pudp)
1107{
1108 pud_t *pudp, old_pud;
1109
1110 pudp = stage2_get_pud(kvm, cache, addr);
1111 VM_BUG_ON(!pudp);
1112
1113 old_pud = *pudp;
1114
1115 /*
1116 * A large number of vcpus faulting on the same stage 2 entry
1117 * can lead to a refault due to the
1118 * stage2_pud_clear()/tlb_flush(). Skip updating the page
1119 * tables if there is no change.
1120 */
1121 if (pud_val(old_pud) == pud_val(*new_pudp))
1122 return 0;
1123
1124 if (stage2_pud_present(kvm, old_pud)) {
1125 stage2_pud_clear(kvm, pudp);
1126 kvm_tlb_flush_vmid_ipa(kvm, addr);
1127 } else {
1128 get_page(virt_to_page(pudp));
1129 }
1130
1131 kvm_set_pud(pudp, *new_pudp);
1132 return 0;
1133}
1134
1135/*
1136 * stage2_get_leaf_entry - walk the stage2 VM page tables and return
1137 * true if a valid and present leaf-entry is found. A pointer to the
1138 * leaf-entry is returned in the appropriate level variable - pudpp,
1139 * pmdpp, ptepp.
1140 */
1141static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr,
1142 pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
1087{ 1143{
1144 pud_t *pudp;
1088 pmd_t *pmdp; 1145 pmd_t *pmdp;
1089 pte_t *ptep; 1146 pte_t *ptep;
1090 1147
1091 pmdp = stage2_get_pmd(kvm, NULL, addr); 1148 *pudpp = NULL;
1149 *pmdpp = NULL;
1150 *ptepp = NULL;
1151
1152 pudp = stage2_get_pud(kvm, NULL, addr);
1153 if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
1154 return false;
1155
1156 if (stage2_pud_huge(kvm, *pudp)) {
1157 *pudpp = pudp;
1158 return true;
1159 }
1160
1161 pmdp = stage2_pmd_offset(kvm, pudp, addr);
1092 if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp)) 1162 if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
1093 return false; 1163 return false;
1094 1164
1095 if (pmd_thp_or_huge(*pmdp)) 1165 if (pmd_thp_or_huge(*pmdp)) {
1096 return kvm_s2pmd_exec(pmdp); 1166 *pmdpp = pmdp;
1167 return true;
1168 }
1097 1169
1098 ptep = pte_offset_kernel(pmdp, addr); 1170 ptep = pte_offset_kernel(pmdp, addr);
1099 if (!ptep || pte_none(*ptep) || !pte_present(*ptep)) 1171 if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
1100 return false; 1172 return false;
1101 1173
1102 return kvm_s2pte_exec(ptep); 1174 *ptepp = ptep;
1175 return true;
1176}
1177
1178static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr)
1179{
1180 pud_t *pudp;
1181 pmd_t *pmdp;
1182 pte_t *ptep;
1183 bool found;
1184
1185 found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep);
1186 if (!found)
1187 return false;
1188
1189 if (pudp)
1190 return kvm_s2pud_exec(pudp);
1191 else if (pmdp)
1192 return kvm_s2pmd_exec(pmdp);
1193 else
1194 return kvm_s2pte_exec(ptep);
1103} 1195}
1104 1196
1105static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, 1197static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
1106 phys_addr_t addr, const pte_t *new_pte, 1198 phys_addr_t addr, const pte_t *new_pte,
1107 unsigned long flags) 1199 unsigned long flags)
1108{ 1200{
1201 pud_t *pud;
1109 pmd_t *pmd; 1202 pmd_t *pmd;
1110 pte_t *pte, old_pte; 1203 pte_t *pte, old_pte;
1111 bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP; 1204 bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
@@ -1114,7 +1207,31 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
1114 VM_BUG_ON(logging_active && !cache); 1207 VM_BUG_ON(logging_active && !cache);
1115 1208
1116 /* Create stage-2 page table mapping - Levels 0 and 1 */ 1209 /* Create stage-2 page table mapping - Levels 0 and 1 */
1117 pmd = stage2_get_pmd(kvm, cache, addr); 1210 pud = stage2_get_pud(kvm, cache, addr);
1211 if (!pud) {
1212 /*
1213 * Ignore calls from kvm_set_spte_hva for unallocated
1214 * address ranges.
1215 */
1216 return 0;
1217 }
1218
1219 /*
1220 * While dirty page logging - dissolve huge PUD, then continue
1221 * on to allocate page.
1222 */
1223 if (logging_active)
1224 stage2_dissolve_pud(kvm, addr, pud);
1225
1226 if (stage2_pud_none(kvm, *pud)) {
1227 if (!cache)
1228 return 0; /* ignore calls from kvm_set_spte_hva */
1229 pmd = mmu_memory_cache_alloc(cache);
1230 stage2_pud_populate(kvm, pud, pmd);
1231 get_page(virt_to_page(pud));
1232 }
1233
1234 pmd = stage2_pmd_offset(kvm, pud, addr);
1118 if (!pmd) { 1235 if (!pmd) {
1119 /* 1236 /*
1120 * Ignore calls from kvm_set_spte_hva for unallocated 1237 * Ignore calls from kvm_set_spte_hva for unallocated
@@ -1182,6 +1299,11 @@ static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
1182 return stage2_ptep_test_and_clear_young((pte_t *)pmd); 1299 return stage2_ptep_test_and_clear_young((pte_t *)pmd);
1183} 1300}
1184 1301
1302static int stage2_pudp_test_and_clear_young(pud_t *pud)
1303{
1304 return stage2_ptep_test_and_clear_young((pte_t *)pud);
1305}
1306
1185/** 1307/**
1186 * kvm_phys_addr_ioremap - map a device range to guest IPA 1308 * kvm_phys_addr_ioremap - map a device range to guest IPA
1187 * 1309 *
@@ -1202,7 +1324,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
1202 pfn = __phys_to_pfn(pa); 1324 pfn = __phys_to_pfn(pa);
1203 1325
1204 for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) { 1326 for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
1205 pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE); 1327 pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE);
1206 1328
1207 if (writable) 1329 if (writable)
1208 pte = kvm_s2pte_mkwrite(pte); 1330 pte = kvm_s2pte_mkwrite(pte);
@@ -1234,7 +1356,7 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
1234 struct page *page = pfn_to_page(pfn); 1356 struct page *page = pfn_to_page(pfn);
1235 1357
1236 /* 1358 /*
1237 * PageTransCompoungMap() returns true for THP and 1359 * PageTransCompoundMap() returns true for THP and
1238 * hugetlbfs. Make sure the adjustment is done only for THP 1360 * hugetlbfs. Make sure the adjustment is done only for THP
1239 * pages. 1361 * pages.
1240 */ 1362 */
@@ -1347,9 +1469,12 @@ static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
1347 do { 1469 do {
1348 next = stage2_pud_addr_end(kvm, addr, end); 1470 next = stage2_pud_addr_end(kvm, addr, end);
1349 if (!stage2_pud_none(kvm, *pud)) { 1471 if (!stage2_pud_none(kvm, *pud)) {
1350 /* TODO:PUD not supported, revisit later if supported */ 1472 if (stage2_pud_huge(kvm, *pud)) {
1351 BUG_ON(stage2_pud_huge(kvm, *pud)); 1473 if (!kvm_s2pud_readonly(pud))
1352 stage2_wp_pmds(kvm, pud, addr, next); 1474 kvm_set_s2pud_readonly(pud);
1475 } else {
1476 stage2_wp_pmds(kvm, pud, addr, next);
1477 }
1353 } 1478 }
1354 } while (pud++, addr = next, addr != end); 1479 } while (pud++, addr = next, addr != end);
1355} 1480}
@@ -1392,7 +1517,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
1392 * 1517 *
1393 * Called to start logging dirty pages after memory region 1518 * Called to start logging dirty pages after memory region
1394 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns 1519 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
1395 * all present PMD and PTEs are write protected in the memory region. 1520 * all present PUD, PMD and PTEs are write protected in the memory region.
1396 * Afterwards read of dirty page log can be called. 1521 * Afterwards read of dirty page log can be called.
1397 * 1522 *
1398 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, 1523 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
@@ -1470,12 +1595,70 @@ static void kvm_send_hwpoison_signal(unsigned long address,
1470 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); 1595 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
1471} 1596}
1472 1597
1598static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot,
1599 unsigned long hva)
1600{
1601 gpa_t gpa_start, gpa_end;
1602 hva_t uaddr_start, uaddr_end;
1603 size_t size;
1604
1605 size = memslot->npages * PAGE_SIZE;
1606
1607 gpa_start = memslot->base_gfn << PAGE_SHIFT;
1608 gpa_end = gpa_start + size;
1609
1610 uaddr_start = memslot->userspace_addr;
1611 uaddr_end = uaddr_start + size;
1612
1613 /*
1614 * Pages belonging to memslots that don't have the same alignment
1615 * within a PMD for userspace and IPA cannot be mapped with stage-2
1616 * PMD entries, because we'll end up mapping the wrong pages.
1617 *
1618 * Consider a layout like the following:
1619 *
1620 * memslot->userspace_addr:
1621 * +-----+--------------------+--------------------+---+
1622 * |abcde|fgh Stage-1 PMD | Stage-1 PMD tv|xyz|
1623 * +-----+--------------------+--------------------+---+
1624 *
1625 * memslot->base_gfn << PAGE_SHIFT:
1626 * +---+--------------------+--------------------+-----+
1627 * |abc|def Stage-2 PMD | Stage-2 PMD |tvxyz|
1628 * +---+--------------------+--------------------+-----+
1629 *
1630 * If we create those stage-2 PMDs, we'll end up with this incorrect
1631 * mapping:
1632 * d -> f
1633 * e -> g
1634 * f -> h
1635 */
1636 if ((gpa_start & ~S2_PMD_MASK) != (uaddr_start & ~S2_PMD_MASK))
1637 return false;
1638
1639 /*
1640 * Next, let's make sure we're not trying to map anything not covered
1641 * by the memslot. This means we have to prohibit PMD size mappings
1642 * for the beginning and end of a non-PMD aligned and non-PMD sized
1643 * memory slot (illustrated by the head and tail parts of the
1644 * userspace view above containing pages 'abcde' and 'xyz',
1645 * respectively).
1646 *
1647 * Note that it doesn't matter if we do the check using the
1648 * userspace_addr or the base_gfn, as both are equally aligned (per
1649 * the check above) and equally sized.
1650 */
1651 return (hva & S2_PMD_MASK) >= uaddr_start &&
1652 (hva & S2_PMD_MASK) + S2_PMD_SIZE <= uaddr_end;
1653}
1654
1473static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, 1655static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1474 struct kvm_memory_slot *memslot, unsigned long hva, 1656 struct kvm_memory_slot *memslot, unsigned long hva,
1475 unsigned long fault_status) 1657 unsigned long fault_status)
1476{ 1658{
1477 int ret; 1659 int ret;
1478 bool write_fault, exec_fault, writable, hugetlb = false, force_pte = false; 1660 bool write_fault, writable, force_pte = false;
1661 bool exec_fault, needs_exec;
1479 unsigned long mmu_seq; 1662 unsigned long mmu_seq;
1480 gfn_t gfn = fault_ipa >> PAGE_SHIFT; 1663 gfn_t gfn = fault_ipa >> PAGE_SHIFT;
1481 struct kvm *kvm = vcpu->kvm; 1664 struct kvm *kvm = vcpu->kvm;
@@ -1484,7 +1667,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1484 kvm_pfn_t pfn; 1667 kvm_pfn_t pfn;
1485 pgprot_t mem_type = PAGE_S2; 1668 pgprot_t mem_type = PAGE_S2;
1486 bool logging_active = memslot_is_logging(memslot); 1669 bool logging_active = memslot_is_logging(memslot);
1487 unsigned long flags = 0; 1670 unsigned long vma_pagesize, flags = 0;
1488 1671
1489 write_fault = kvm_is_write_fault(vcpu); 1672 write_fault = kvm_is_write_fault(vcpu);
1490 exec_fault = kvm_vcpu_trap_is_iabt(vcpu); 1673 exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
@@ -1495,6 +1678,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1495 return -EFAULT; 1678 return -EFAULT;
1496 } 1679 }
1497 1680
1681 if (!fault_supports_stage2_pmd_mappings(memslot, hva))
1682 force_pte = true;
1683
1684 if (logging_active)
1685 force_pte = true;
1686
1498 /* Let's check if we will get back a huge page backed by hugetlbfs */ 1687 /* Let's check if we will get back a huge page backed by hugetlbfs */
1499 down_read(&current->mm->mmap_sem); 1688 down_read(&current->mm->mmap_sem);
1500 vma = find_vma_intersection(current->mm, hva, hva + 1); 1689 vma = find_vma_intersection(current->mm, hva, hva + 1);
@@ -1504,22 +1693,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1504 return -EFAULT; 1693 return -EFAULT;
1505 } 1694 }
1506 1695
1507 if (vma_kernel_pagesize(vma) == PMD_SIZE && !logging_active) { 1696 vma_pagesize = vma_kernel_pagesize(vma);
1508 hugetlb = true; 1697 /*
1509 gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; 1698 * PUD level may not exist for a VM but PMD is guaranteed to
1510 } else { 1699 * exist.
1511 /* 1700 */
1512 * Pages belonging to memslots that don't have the same 1701 if ((vma_pagesize == PMD_SIZE ||
1513 * alignment for userspace and IPA cannot be mapped using 1702 (vma_pagesize == PUD_SIZE && kvm_stage2_has_pud(kvm))) &&
1514 * block descriptors even if the pages belong to a THP for 1703 !force_pte) {
1515 * the process, because the stage-2 block descriptor will 1704 gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
1516 * cover more than a single THP and we loose atomicity for
1517 * unmapping, updates, and splits of the THP or other pages
1518 * in the stage-2 block range.
1519 */
1520 if ((memslot->userspace_addr & ~PMD_MASK) !=
1521 ((memslot->base_gfn << PAGE_SHIFT) & ~PMD_MASK))
1522 force_pte = true;
1523 } 1705 }
1524 up_read(&current->mm->mmap_sem); 1706 up_read(&current->mm->mmap_sem);
1525 1707
@@ -1558,7 +1740,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1558 * should not be mapped with huge pages (it introduces churn 1740 * should not be mapped with huge pages (it introduces churn
1559 * and performance degradation), so force a pte mapping. 1741 * and performance degradation), so force a pte mapping.
1560 */ 1742 */
1561 force_pte = true;
1562 flags |= KVM_S2_FLAG_LOGGING_ACTIVE; 1743 flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
1563 1744
1564 /* 1745 /*
@@ -1573,50 +1754,69 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1573 if (mmu_notifier_retry(kvm, mmu_seq)) 1754 if (mmu_notifier_retry(kvm, mmu_seq))
1574 goto out_unlock; 1755 goto out_unlock;
1575 1756
1576 if (!hugetlb && !force_pte) 1757 if (vma_pagesize == PAGE_SIZE && !force_pte) {
1577 hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa); 1758 /*
1759 * Only PMD_SIZE transparent hugepages (THP) are
1760 * currently supported. This code will need to be
1761 * updated to support other THP sizes.
1762 */
1763 if (transparent_hugepage_adjust(&pfn, &fault_ipa))
1764 vma_pagesize = PMD_SIZE;
1765 }
1578 1766
1579 if (hugetlb) { 1767 if (writable)
1580 pmd_t new_pmd = pfn_pmd(pfn, mem_type); 1768 kvm_set_pfn_dirty(pfn);
1581 new_pmd = pmd_mkhuge(new_pmd);
1582 if (writable) {
1583 new_pmd = kvm_s2pmd_mkwrite(new_pmd);
1584 kvm_set_pfn_dirty(pfn);
1585 }
1586 1769
1587 if (fault_status != FSC_PERM) 1770 if (fault_status != FSC_PERM)
1588 clean_dcache_guest_page(pfn, PMD_SIZE); 1771 clean_dcache_guest_page(pfn, vma_pagesize);
1589 1772
1590 if (exec_fault) { 1773 if (exec_fault)
1774 invalidate_icache_guest_page(pfn, vma_pagesize);
1775
1776 /*
1777 * If we took an execution fault we have made the
1778 * icache/dcache coherent above and should now let the s2
1779 * mapping be executable.
1780 *
1781 * Write faults (!exec_fault && FSC_PERM) are orthogonal to
1782 * execute permissions, and we preserve whatever we have.
1783 */
1784 needs_exec = exec_fault ||
1785 (fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa));
1786
1787 if (vma_pagesize == PUD_SIZE) {
1788 pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
1789
1790 new_pud = kvm_pud_mkhuge(new_pud);
1791 if (writable)
1792 new_pud = kvm_s2pud_mkwrite(new_pud);
1793
1794 if (needs_exec)
1795 new_pud = kvm_s2pud_mkexec(new_pud);
1796
1797 ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud);
1798 } else if (vma_pagesize == PMD_SIZE) {
1799 pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
1800
1801 new_pmd = kvm_pmd_mkhuge(new_pmd);
1802
1803 if (writable)
1804 new_pmd = kvm_s2pmd_mkwrite(new_pmd);
1805
1806 if (needs_exec)
1591 new_pmd = kvm_s2pmd_mkexec(new_pmd); 1807 new_pmd = kvm_s2pmd_mkexec(new_pmd);
1592 invalidate_icache_guest_page(pfn, PMD_SIZE);
1593 } else if (fault_status == FSC_PERM) {
1594 /* Preserve execute if XN was already cleared */
1595 if (stage2_is_exec(kvm, fault_ipa))
1596 new_pmd = kvm_s2pmd_mkexec(new_pmd);
1597 }
1598 1808
1599 ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); 1809 ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
1600 } else { 1810 } else {
1601 pte_t new_pte = pfn_pte(pfn, mem_type); 1811 pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
1602 1812
1603 if (writable) { 1813 if (writable) {
1604 new_pte = kvm_s2pte_mkwrite(new_pte); 1814 new_pte = kvm_s2pte_mkwrite(new_pte);
1605 kvm_set_pfn_dirty(pfn);
1606 mark_page_dirty(kvm, gfn); 1815 mark_page_dirty(kvm, gfn);
1607 } 1816 }
1608 1817
1609 if (fault_status != FSC_PERM) 1818 if (needs_exec)
1610 clean_dcache_guest_page(pfn, PAGE_SIZE);
1611
1612 if (exec_fault) {
1613 new_pte = kvm_s2pte_mkexec(new_pte); 1819 new_pte = kvm_s2pte_mkexec(new_pte);
1614 invalidate_icache_guest_page(pfn, PAGE_SIZE);
1615 } else if (fault_status == FSC_PERM) {
1616 /* Preserve execute if XN was already cleared */
1617 if (stage2_is_exec(kvm, fault_ipa))
1618 new_pte = kvm_s2pte_mkexec(new_pte);
1619 }
1620 1820
1621 ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags); 1821 ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
1622 } 1822 }
@@ -1637,6 +1837,7 @@ out_unlock:
1637 */ 1837 */
1638static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) 1838static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
1639{ 1839{
1840 pud_t *pud;
1640 pmd_t *pmd; 1841 pmd_t *pmd;
1641 pte_t *pte; 1842 pte_t *pte;
1642 kvm_pfn_t pfn; 1843 kvm_pfn_t pfn;
@@ -1646,24 +1847,23 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
1646 1847
1647 spin_lock(&vcpu->kvm->mmu_lock); 1848 spin_lock(&vcpu->kvm->mmu_lock);
1648 1849
1649 pmd = stage2_get_pmd(vcpu->kvm, NULL, fault_ipa); 1850 if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte))
1650 if (!pmd || pmd_none(*pmd)) /* Nothing there */
1651 goto out; 1851 goto out;
1652 1852
1653 if (pmd_thp_or_huge(*pmd)) { /* THP, HugeTLB */ 1853 if (pud) { /* HugeTLB */
1854 *pud = kvm_s2pud_mkyoung(*pud);
1855 pfn = kvm_pud_pfn(*pud);
1856 pfn_valid = true;
1857 } else if (pmd) { /* THP, HugeTLB */
1654 *pmd = pmd_mkyoung(*pmd); 1858 *pmd = pmd_mkyoung(*pmd);
1655 pfn = pmd_pfn(*pmd); 1859 pfn = pmd_pfn(*pmd);
1656 pfn_valid = true; 1860 pfn_valid = true;
1657 goto out; 1861 } else {
1862 *pte = pte_mkyoung(*pte); /* Just a page... */
1863 pfn = pte_pfn(*pte);
1864 pfn_valid = true;
1658 } 1865 }
1659 1866
1660 pte = pte_offset_kernel(pmd, fault_ipa);
1661 if (pte_none(*pte)) /* Nothing there either */
1662 goto out;
1663
1664 *pte = pte_mkyoung(*pte); /* Just a page... */
1665 pfn = pte_pfn(*pte);
1666 pfn_valid = true;
1667out: 1867out:
1668 spin_unlock(&vcpu->kvm->mmu_lock); 1868 spin_unlock(&vcpu->kvm->mmu_lock);
1669 if (pfn_valid) 1869 if (pfn_valid)
@@ -1849,14 +2049,14 @@ static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data
1849} 2049}
1850 2050
1851 2051
1852void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) 2052int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1853{ 2053{
1854 unsigned long end = hva + PAGE_SIZE; 2054 unsigned long end = hva + PAGE_SIZE;
1855 kvm_pfn_t pfn = pte_pfn(pte); 2055 kvm_pfn_t pfn = pte_pfn(pte);
1856 pte_t stage2_pte; 2056 pte_t stage2_pte;
1857 2057
1858 if (!kvm->arch.pgd) 2058 if (!kvm->arch.pgd)
1859 return; 2059 return 0;
1860 2060
1861 trace_kvm_set_spte_hva(hva); 2061 trace_kvm_set_spte_hva(hva);
1862 2062
@@ -1865,48 +2065,46 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1865 * just like a translation fault and clean the cache to the PoC. 2065 * just like a translation fault and clean the cache to the PoC.
1866 */ 2066 */
1867 clean_dcache_guest_page(pfn, PAGE_SIZE); 2067 clean_dcache_guest_page(pfn, PAGE_SIZE);
1868 stage2_pte = pfn_pte(pfn, PAGE_S2); 2068 stage2_pte = kvm_pfn_pte(pfn, PAGE_S2);
1869 handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); 2069 handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
2070
2071 return 0;
1870} 2072}
1871 2073
1872static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) 2074static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
1873{ 2075{
2076 pud_t *pud;
1874 pmd_t *pmd; 2077 pmd_t *pmd;
1875 pte_t *pte; 2078 pte_t *pte;
1876 2079
1877 WARN_ON(size != PAGE_SIZE && size != PMD_SIZE); 2080 WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
1878 pmd = stage2_get_pmd(kvm, NULL, gpa); 2081 if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
1879 if (!pmd || pmd_none(*pmd)) /* Nothing there */
1880 return 0; 2082 return 0;
1881 2083
1882 if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */ 2084 if (pud)
2085 return stage2_pudp_test_and_clear_young(pud);
2086 else if (pmd)
1883 return stage2_pmdp_test_and_clear_young(pmd); 2087 return stage2_pmdp_test_and_clear_young(pmd);
1884 2088 else
1885 pte = pte_offset_kernel(pmd, gpa); 2089 return stage2_ptep_test_and_clear_young(pte);
1886 if (pte_none(*pte))
1887 return 0;
1888
1889 return stage2_ptep_test_and_clear_young(pte);
1890} 2090}
1891 2091
1892static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) 2092static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
1893{ 2093{
2094 pud_t *pud;
1894 pmd_t *pmd; 2095 pmd_t *pmd;
1895 pte_t *pte; 2096 pte_t *pte;
1896 2097
1897 WARN_ON(size != PAGE_SIZE && size != PMD_SIZE); 2098 WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
1898 pmd = stage2_get_pmd(kvm, NULL, gpa); 2099 if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
1899 if (!pmd || pmd_none(*pmd)) /* Nothing there */
1900 return 0; 2100 return 0;
1901 2101
1902 if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */ 2102 if (pud)
2103 return kvm_s2pud_young(*pud);
2104 else if (pmd)
1903 return pmd_young(*pmd); 2105 return pmd_young(*pmd);
1904 2106 else
1905 pte = pte_offset_kernel(pmd, gpa);
1906 if (!pte_none(*pte)) /* Just a page... */
1907 return pte_young(*pte); 2107 return pte_young(*pte);
1908
1909 return 0;
1910} 2108}
1911 2109
1912int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) 2110int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
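
The alignment rule enforced by fault_supports_stage2_pmd_mappings() above can be checked with concrete numbers. The standalone sketch below is only an illustration: it assumes 4K pages (so a 2 MiB stage-2 PMD) and made-up memslot addresses, and restates the function's two checks in userspace rather than reusing kernel code.

    /* Illustrative restatement of fault_supports_stage2_pmd_mappings();
     * sizes and addresses are assumptions, not values from a real memslot. */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define S2_PMD_SIZE	(1UL << 21)		/* 2 MiB with 4K pages */
    #define S2_PMD_MASK	(~(S2_PMD_SIZE - 1))

    static bool pmd_mapping_ok(uint64_t gpa_start, uint64_t uaddr_start,
    			   uint64_t size, uint64_t hva)
    {
    	uint64_t uaddr_end = uaddr_start + size;

    	/* IPA and HVA must share the same offset within a PMD ... */
    	if ((gpa_start & ~S2_PMD_MASK) != (uaddr_start & ~S2_PMD_MASK))
    		return false;

    	/* ... and the 2 MiB block around hva must lie inside the memslot. */
    	return (hva & S2_PMD_MASK) >= uaddr_start &&
    	       (hva & S2_PMD_MASK) + S2_PMD_SIZE <= uaddr_end;
    }

    int main(void)
    {
    	/* aligned slot: interior faults may use PMD mappings */
    	printf("%d\n", pmd_mapping_ok(0x80000000UL, 0x7f0000000000UL,
    				      64UL << 20, 0x7f0000200000UL));	/* 1 */
    	/* IPA and HVA offsets differ by 64K inside the PMD: PTEs only */
    	printf("%d\n", pmd_mapping_ok(0x80010000UL, 0x7f0000000000UL,
    				      64UL << 20, 0x7f0000200000UL));	/* 0 */
    	return 0;
    }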
diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h
index 57b3edebbb40..3828beab93f2 100644
--- a/virt/kvm/arm/trace.h
+++ b/virt/kvm/arm/trace.h
@@ -26,25 +26,25 @@ TRACE_EVENT(kvm_entry,
26); 26);
27 27
28TRACE_EVENT(kvm_exit, 28TRACE_EVENT(kvm_exit,
29 TP_PROTO(int idx, unsigned int exit_reason, unsigned long vcpu_pc), 29 TP_PROTO(int ret, unsigned int esr_ec, unsigned long vcpu_pc),
30 TP_ARGS(idx, exit_reason, vcpu_pc), 30 TP_ARGS(ret, esr_ec, vcpu_pc),
31 31
32 TP_STRUCT__entry( 32 TP_STRUCT__entry(
33 __field( int, idx ) 33 __field( int, ret )
34 __field( unsigned int, exit_reason ) 34 __field( unsigned int, esr_ec )
35 __field( unsigned long, vcpu_pc ) 35 __field( unsigned long, vcpu_pc )
36 ), 36 ),
37 37
38 TP_fast_assign( 38 TP_fast_assign(
39 __entry->idx = idx; 39 __entry->ret = ARM_EXCEPTION_CODE(ret);
40 __entry->exit_reason = exit_reason; 40 __entry->esr_ec = ARM_EXCEPTION_IS_TRAP(ret) ? esr_ec : 0;
41 __entry->vcpu_pc = vcpu_pc; 41 __entry->vcpu_pc = vcpu_pc;
42 ), 42 ),
43 43
44 TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx", 44 TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx",
45 __print_symbolic(__entry->idx, kvm_arm_exception_type), 45 __print_symbolic(__entry->ret, kvm_arm_exception_type),
46 __entry->exit_reason, 46 __entry->esr_ec,
47 __print_symbolic(__entry->exit_reason, kvm_arm_exception_class), 47 __print_symbolic(__entry->esr_ec, kvm_arm_exception_class),
48 __entry->vcpu_pc) 48 __entry->vcpu_pc)
49); 49);
50 50
diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
index f56ff1cf52ec..ceeda7e04a4d 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.c
+++ b/virt/kvm/arm/vgic/vgic-mmio.c
@@ -313,36 +313,30 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
313 313
314 spin_lock_irqsave(&irq->irq_lock, flags); 314 spin_lock_irqsave(&irq->irq_lock, flags);
315 315
316 /*
317 * If this virtual IRQ was written into a list register, we
318 * have to make sure the CPU that runs the VCPU thread has
319 * synced back the LR state to the struct vgic_irq.
320 *
321 * As long as the conditions below are true, we know the VCPU thread
322 * may be on its way back from the guest (we kicked the VCPU thread in
323 * vgic_change_active_prepare) and still has to sync back this IRQ,
324 * so we release and re-acquire the spin_lock to let the other thread
325 * sync back the IRQ.
326 *
327 * When accessing VGIC state from user space, requester_vcpu is
328 * NULL, which is fine, because we guarantee that no VCPUs are running
329 * when accessing VGIC state from user space so irq->vcpu->cpu is
330 * always -1.
331 */
332 while (irq->vcpu && /* IRQ may have state in an LR somewhere */
333 irq->vcpu != requester_vcpu && /* Current thread is not the VCPU thread */
334 irq->vcpu->cpu != -1) /* VCPU thread is running */
335 cond_resched_lock(&irq->irq_lock);
336
337 if (irq->hw) { 316 if (irq->hw) {
338 vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu); 317 vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu);
339 } else { 318 } else {
340 u32 model = vcpu->kvm->arch.vgic.vgic_model; 319 u32 model = vcpu->kvm->arch.vgic.vgic_model;
320 u8 active_source;
341 321
342 irq->active = active; 322 irq->active = active;
323
324 /*
325 * The GICv2 architecture indicates that the source CPUID for
326 * an SGI should be provided during an EOI which implies that
327 * the active state is stored somewhere, but at the same time
328 * this state is not architecturally exposed anywhere and we
329 * have no way of knowing the right source.
330 *
331 * This may lead to a VCPU not being able to receive
332 * additional instances of a particular SGI after migration
333 * for a GICv2 VM on some GIC implementations. Oh well.
334 */
335 active_source = (requester_vcpu) ? requester_vcpu->vcpu_id : 0;
336
343 if (model == KVM_DEV_TYPE_ARM_VGIC_V2 && 337 if (model == KVM_DEV_TYPE_ARM_VGIC_V2 &&
344 active && vgic_irq_is_sgi(irq->intid)) 338 active && vgic_irq_is_sgi(irq->intid))
345 irq->active_source = requester_vcpu->vcpu_id; 339 irq->active_source = active_source;
346 } 340 }
347 341
348 if (irq->active) 342 if (irq->active)
@@ -368,14 +362,16 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
368 */ 362 */
369static void vgic_change_active_prepare(struct kvm_vcpu *vcpu, u32 intid) 363static void vgic_change_active_prepare(struct kvm_vcpu *vcpu, u32 intid)
370{ 364{
371 if (intid > VGIC_NR_PRIVATE_IRQS) 365 if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
366 intid > VGIC_NR_PRIVATE_IRQS)
372 kvm_arm_halt_guest(vcpu->kvm); 367 kvm_arm_halt_guest(vcpu->kvm);
373} 368}
374 369
375/* See vgic_change_active_prepare */ 370/* See vgic_change_active_prepare */
376static void vgic_change_active_finish(struct kvm_vcpu *vcpu, u32 intid) 371static void vgic_change_active_finish(struct kvm_vcpu *vcpu, u32 intid)
377{ 372{
378 if (intid > VGIC_NR_PRIVATE_IRQS) 373 if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
374 intid > VGIC_NR_PRIVATE_IRQS)
379 kvm_arm_resume_guest(vcpu->kvm); 375 kvm_arm_resume_guest(vcpu->kvm);
380} 376}
381 377
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index 7cfdfbc910e0..a6b135491b6c 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -103,13 +103,13 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
103{ 103{
104 /* SGIs and PPIs */ 104 /* SGIs and PPIs */
105 if (intid <= VGIC_MAX_PRIVATE) { 105 if (intid <= VGIC_MAX_PRIVATE) {
106 intid = array_index_nospec(intid, VGIC_MAX_PRIVATE); 106 intid = array_index_nospec(intid, VGIC_MAX_PRIVATE + 1);
107 return &vcpu->arch.vgic_cpu.private_irqs[intid]; 107 return &vcpu->arch.vgic_cpu.private_irqs[intid];
108 } 108 }
109 109
110 /* SPIs */ 110 /* SPIs */
111 if (intid <= VGIC_MAX_SPI) { 111 if (intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) {
112 intid = array_index_nospec(intid, VGIC_MAX_SPI); 112 intid = array_index_nospec(intid, kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS);
113 return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS]; 113 return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS];
114 } 114 }
115 115
@@ -908,6 +908,7 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
908 struct vgic_irq *irq; 908 struct vgic_irq *irq;
909 bool pending = false; 909 bool pending = false;
910 unsigned long flags; 910 unsigned long flags;
911 struct vgic_vmcr vmcr;
911 912
912 if (!vcpu->kvm->arch.vgic.enabled) 913 if (!vcpu->kvm->arch.vgic.enabled)
913 return false; 914 return false;
@@ -915,11 +916,15 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
915 if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last) 916 if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last)
916 return true; 917 return true;
917 918
919 vgic_get_vmcr(vcpu, &vmcr);
920
918 spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); 921 spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
919 922
920 list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { 923 list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
921 spin_lock(&irq->irq_lock); 924 spin_lock(&irq->irq_lock);
922 pending = irq_is_pending(irq) && irq->enabled; 925 pending = irq_is_pending(irq) && irq->enabled &&
926 !irq->active &&
927 irq->priority < vmcr.pmr;
923 spin_unlock(&irq->irq_lock); 928 spin_unlock(&irq->irq_lock);
924 929
925 if (pending) 930 if (pending)
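
The vgic_get_irq() change above fixes an off-by-one in the speculation clamp: array_index_nospec(index, size) treats size as an exclusive bound, so an intid that may equal VGIC_MAX_PRIVATE needs a bound of VGIC_MAX_PRIVATE + 1. A userspace stand-in (not speculation-safe, and assuming VGIC_MAX_PRIVATE == 31) makes the difference visible:

    /* Stand-in for array_index_nospec(): the real helper clamps without
     * branching so it also holds under speculation; only the exclusive-bound
     * semantics matter here. */
    #include <stddef.h>
    #include <stdio.h>

    #define VGIC_MAX_PRIVATE 31	/* assumed value of the kernel constant */

    static size_t index_nospec(size_t index, size_t size)
    {
    	return index < size ? index : 0;	/* out-of-range indices become 0 */
    }

    int main(void)
    {
    	/* old call: intid == 31 passes the "<= VGIC_MAX_PRIVATE" test, but a
    	 * bound of 31 clamps it to 0, returning the wrong vgic_irq */
    	printf("%zu\n", index_nospec(31, VGIC_MAX_PRIVATE));		/* 0 */
    	/* fixed call: an inclusive range needs bound + 1 */
    	printf("%zu\n", index_nospec(31, VGIC_MAX_PRIVATE + 1));	/* 31 */
    	return 0;
    }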
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 23c2519c5b32..110cbe3f74f8 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -82,7 +82,7 @@ static void async_pf_execute(struct work_struct *work)
82 might_sleep(); 82 might_sleep();
83 83
84 /* 84 /*
85 * This work is run asynchromously to the task which owns 85 * This work is run asynchronously to the task which owns
86 * mm and might be done in another context, so we must 86 * mm and might be done in another context, so we must
87 * access remotely. 87 * access remotely.
88 */ 88 */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2679e476b6c3..cf7cc0554094 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -354,7 +354,10 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
354 idx = srcu_read_lock(&kvm->srcu); 354 idx = srcu_read_lock(&kvm->srcu);
355 spin_lock(&kvm->mmu_lock); 355 spin_lock(&kvm->mmu_lock);
356 kvm->mmu_notifier_seq++; 356 kvm->mmu_notifier_seq++;
357 kvm_set_spte_hva(kvm, address, pte); 357
358 if (kvm_set_spte_hva(kvm, address, pte))
359 kvm_flush_remote_tlbs(kvm);
360
358 spin_unlock(&kvm->mmu_lock); 361 spin_unlock(&kvm->mmu_lock);
359 srcu_read_unlock(&kvm->srcu, idx); 362 srcu_read_unlock(&kvm->srcu, idx);
360} 363}
@@ -1133,7 +1136,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
1133#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 1136#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
1134/** 1137/**
1135 * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages 1138 * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages
1136 * are dirty write protect them for next write. 1139 * and reenable dirty page tracking for the corresponding pages.
1137 * @kvm: pointer to kvm instance 1140 * @kvm: pointer to kvm instance
1138 * @log: slot id and address to which we copy the log 1141 * @log: slot id and address to which we copy the log
1139 * @is_dirty: flag set if any page is dirty 1142 * @is_dirty: flag set if any page is dirty
@@ -1154,7 +1157,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
1154 * 1157 *
1155 */ 1158 */
1156int kvm_get_dirty_log_protect(struct kvm *kvm, 1159int kvm_get_dirty_log_protect(struct kvm *kvm,
1157 struct kvm_dirty_log *log, bool *is_dirty) 1160 struct kvm_dirty_log *log, bool *flush)
1158{ 1161{
1159 struct kvm_memslots *slots; 1162 struct kvm_memslots *slots;
1160 struct kvm_memory_slot *memslot; 1163 struct kvm_memory_slot *memslot;
@@ -1176,37 +1179,114 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
1176 return -ENOENT; 1179 return -ENOENT;
1177 1180
1178 n = kvm_dirty_bitmap_bytes(memslot); 1181 n = kvm_dirty_bitmap_bytes(memslot);
1182 *flush = false;
1183 if (kvm->manual_dirty_log_protect) {
1184 /*
1185 * Unlike kvm_get_dirty_log, we always return false in *flush,
1186 * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There
1187 * is some code duplication between this function and
1189 * kvm_get_dirty_log, but hopefully, once all architectures
1190 * transition to kvm_get_dirty_log_protect, kvm_get_dirty_log
1191 * can be eliminated.
1191 */
1192 dirty_bitmap_buffer = dirty_bitmap;
1193 } else {
1194 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
1195 memset(dirty_bitmap_buffer, 0, n);
1196
1197 spin_lock(&kvm->mmu_lock);
1198 for (i = 0; i < n / sizeof(long); i++) {
1199 unsigned long mask;
1200 gfn_t offset;
1201
1202 if (!dirty_bitmap[i])
1203 continue;
1204
1205 *flush = true;
1206 mask = xchg(&dirty_bitmap[i], 0);
1207 dirty_bitmap_buffer[i] = mask;
1208
1209 if (mask) {
1210 offset = i * BITS_PER_LONG;
1211 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
1212 offset, mask);
1213 }
1214 }
1215 spin_unlock(&kvm->mmu_lock);
1216 }
1217
1218 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
1219 return -EFAULT;
1220 return 0;
1221}
1222EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
1179 1223
1224/**
1225 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
1226 * and reenable dirty page tracking for the corresponding pages.
1227 * @kvm: pointer to kvm instance
1228 * @log: slot id and address from which to fetch the bitmap of dirty pages
1229 */
1230int kvm_clear_dirty_log_protect(struct kvm *kvm,
1231 struct kvm_clear_dirty_log *log, bool *flush)
1232{
1233 struct kvm_memslots *slots;
1234 struct kvm_memory_slot *memslot;
1235 int as_id, id, n;
1236 gfn_t offset;
1237 unsigned long i;
1238 unsigned long *dirty_bitmap;
1239 unsigned long *dirty_bitmap_buffer;
1240
1241 as_id = log->slot >> 16;
1242 id = (u16)log->slot;
1243 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1244 return -EINVAL;
1245
1246 if ((log->first_page & 63) || (log->num_pages & 63))
1247 return -EINVAL;
1248
1249 slots = __kvm_memslots(kvm, as_id);
1250 memslot = id_to_memslot(slots, id);
1251
1252 dirty_bitmap = memslot->dirty_bitmap;
1253 if (!dirty_bitmap)
1254 return -ENOENT;
1255
1256 n = kvm_dirty_bitmap_bytes(memslot);
1257 *flush = false;
1180 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); 1258 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
1181 memset(dirty_bitmap_buffer, 0, n); 1259 if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
1260 return -EFAULT;
1182 1261
1183 spin_lock(&kvm->mmu_lock); 1262 spin_lock(&kvm->mmu_lock);
1184 *is_dirty = false; 1263 for (offset = log->first_page,
1185 for (i = 0; i < n / sizeof(long); i++) { 1264 i = offset / BITS_PER_LONG, n = log->num_pages / BITS_PER_LONG; n--;
1186 unsigned long mask; 1265 i++, offset += BITS_PER_LONG) {
1187 gfn_t offset; 1266 unsigned long mask = *dirty_bitmap_buffer++;
1188 1267 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
1189 if (!dirty_bitmap[i]) 1268 if (!mask)
1190 continue; 1269 continue;
1191 1270
1192 *is_dirty = true; 1271 mask &= atomic_long_fetch_andnot(mask, p);
1193
1194 mask = xchg(&dirty_bitmap[i], 0);
1195 dirty_bitmap_buffer[i] = mask;
1196 1272
1273 /*
1274 * mask contains the bits that really have been cleared. This
1275 * never includes any bits beyond the length of the memslot (if
1276 * the length is not aligned to 64 pages), therefore it is not
1277 * a problem if userspace sets them in log->dirty_bitmap.
1278 */
1197 if (mask) { 1279 if (mask) {
1198 offset = i * BITS_PER_LONG; 1280 *flush = true;
1199 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, 1281 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
1200 offset, mask); 1282 offset, mask);
1201 } 1283 }
1202 } 1284 }
1203
1204 spin_unlock(&kvm->mmu_lock); 1285 spin_unlock(&kvm->mmu_lock);
1205 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) 1286
1206 return -EFAULT;
1207 return 0; 1287 return 0;
1208} 1288}
1209EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); 1289EXPORT_SYMBOL_GPL(kvm_clear_dirty_log_protect);
1210#endif 1290#endif
1211 1291
1212bool kvm_largepages_enabled(void) 1292bool kvm_largepages_enabled(void)
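
The hunks above split dirty-page harvesting into a snapshot step (KVM_GET_DIRTY_LOG) and an explicit reprotect step (KVM_CLEAR_DIRTY_LOG) once KVM_CAP_MANUAL_DIRTY_LOG_PROTECT is enabled on the VM. A hedged userspace sketch of that flow follows; harvest_dirty_pages(), vm_fd, slot and npages are hypothetical, and it assumes uapi headers from this series for the new ioctl, capability and struct kvm_clear_dirty_log.

    /* Hypothetical userspace flow for two-step dirty logging; assumes an
     * already-created vm_fd with dirty logging enabled on memslot `slot`. */
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int harvest_dirty_pages(int vm_fd, __u32 slot, __u32 npages)
    {
    	unsigned long *bitmap = calloc((npages + 63) / 64, sizeof(__u64));
    	struct kvm_dirty_log get = { .slot = slot, .dirty_bitmap = bitmap };
    	struct kvm_clear_dirty_log clear = {
    		.slot		= slot,
    		.first_page	= 0,
    		.num_pages	= npages,	/* must be a multiple of 64 */
    		.dirty_bitmap	= bitmap,
    	};
    	struct kvm_enable_cap cap = {
    		.cap	 = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT,
    		.args[0] = 1,			/* decouple GET from reprotect */
    	};

    	if (!bitmap || ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
    		goto err;

    	/* step 1: snapshot the bitmap; pages are NOT write-protected yet */
    	if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &get))
    		goto err;

    	/* ... copy out the pages marked in `bitmap` here ... */

    	/* step 2: clear the handled bits, re-enabling dirty tracking for them */
    	if (ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear))
    		goto err;

    	free(bitmap);
    	return 0;
    err:
    	free(bitmap);
    	return -1;
    }

Note that first_page and num_pages must be multiples of 64, matching the (log->first_page & 63) || (log->num_pages & 63) check in kvm_clear_dirty_log_protect() above.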
@@ -1928,32 +2008,33 @@ static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
1928 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; 2008 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
1929 gfn_t nr_pages_needed = end_gfn - start_gfn + 1; 2009 gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
1930 gfn_t nr_pages_avail; 2010 gfn_t nr_pages_avail;
2011 int r = start_gfn <= end_gfn ? 0 : -EINVAL;
1931 2012
1932 ghc->gpa = gpa; 2013 ghc->gpa = gpa;
1933 ghc->generation = slots->generation; 2014 ghc->generation = slots->generation;
1934 ghc->len = len; 2015 ghc->len = len;
1935 ghc->memslot = __gfn_to_memslot(slots, start_gfn); 2016 ghc->hva = KVM_HVA_ERR_BAD;
1936 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL); 2017
1937 if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) { 2018 /*
2019 * If the requested region crosses two memslots, we still
2020 * verify that the entire region is valid here.
2021 */
2022 while (!r && start_gfn <= end_gfn) {
2023 ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2024 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
2025 &nr_pages_avail);
2026 if (kvm_is_error_hva(ghc->hva))
2027 r = -EFAULT;
2028 start_gfn += nr_pages_avail;
2029 }
2030
2031 /* Use the slow path for cross page reads and writes. */
2032 if (!r && nr_pages_needed == 1)
1938 ghc->hva += offset; 2033 ghc->hva += offset;
1939 } else { 2034 else
1940 /*
1941 * If the requested region crosses two memslots, we still
1942 * verify that the entire region is valid here.
1943 */
1944 while (start_gfn <= end_gfn) {
1945 nr_pages_avail = 0;
1946 ghc->memslot = __gfn_to_memslot(slots, start_gfn);
1947 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
1948 &nr_pages_avail);
1949 if (kvm_is_error_hva(ghc->hva))
1950 return -EFAULT;
1951 start_gfn += nr_pages_avail;
1952 }
1953 /* Use the slow path for cross page reads and writes. */
1954 ghc->memslot = NULL; 2035 ghc->memslot = NULL;
1955 } 2036
1956 return 0; 2037 return r;
1957} 2038}
1958 2039
1959int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2040int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
@@ -1965,7 +2046,8 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1965EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); 2046EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
1966 2047
1967int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2048int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1968 void *data, int offset, unsigned long len) 2049 void *data, unsigned int offset,
2050 unsigned long len)
1969{ 2051{
1970 struct kvm_memslots *slots = kvm_memslots(kvm); 2052 struct kvm_memslots *slots = kvm_memslots(kvm);
1971 int r; 2053 int r;
@@ -2948,6 +3030,10 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
2948#endif 3030#endif
2949 case KVM_CAP_IOEVENTFD_ANY_LENGTH: 3031 case KVM_CAP_IOEVENTFD_ANY_LENGTH:
2950 case KVM_CAP_CHECK_EXTENSION_VM: 3032 case KVM_CAP_CHECK_EXTENSION_VM:
3033 case KVM_CAP_ENABLE_CAP_VM:
3034#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3035 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT:
3036#endif
2951 return 1; 3037 return 1;
2952#ifdef CONFIG_KVM_MMIO 3038#ifdef CONFIG_KVM_MMIO
2953 case KVM_CAP_COALESCED_MMIO: 3039 case KVM_CAP_COALESCED_MMIO:
@@ -2971,6 +3057,28 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
2971 return kvm_vm_ioctl_check_extension(kvm, arg); 3057 return kvm_vm_ioctl_check_extension(kvm, arg);
2972} 3058}
2973 3059
3060int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
3061 struct kvm_enable_cap *cap)
3062{
3063 return -EINVAL;
3064}
3065
3066static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
3067 struct kvm_enable_cap *cap)
3068{
3069 switch (cap->cap) {
3070#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3071 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT:
3072 if (cap->flags || (cap->args[0] & ~1))
3073 return -EINVAL;
3074 kvm->manual_dirty_log_protect = cap->args[0];
3075 return 0;
3076#endif
3077 default:
3078 return kvm_vm_ioctl_enable_cap(kvm, cap);
3079 }
3080}
3081
2974static long kvm_vm_ioctl(struct file *filp, 3082static long kvm_vm_ioctl(struct file *filp,
2975 unsigned int ioctl, unsigned long arg) 3083 unsigned int ioctl, unsigned long arg)
2976{ 3084{
@@ -2984,6 +3092,15 @@ static long kvm_vm_ioctl(struct file *filp,
2984 case KVM_CREATE_VCPU: 3092 case KVM_CREATE_VCPU:
2985 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 3093 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
2986 break; 3094 break;
3095 case KVM_ENABLE_CAP: {
3096 struct kvm_enable_cap cap;
3097
3098 r = -EFAULT;
3099 if (copy_from_user(&cap, argp, sizeof(cap)))
3100 goto out;
3101 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
3102 break;
3103 }
2987 case KVM_SET_USER_MEMORY_REGION: { 3104 case KVM_SET_USER_MEMORY_REGION: {
2988 struct kvm_userspace_memory_region kvm_userspace_mem; 3105 struct kvm_userspace_memory_region kvm_userspace_mem;
2989 3106
@@ -3004,6 +3121,17 @@ static long kvm_vm_ioctl(struct file *filp,
3004 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 3121 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
3005 break; 3122 break;
3006 } 3123 }
3124#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3125 case KVM_CLEAR_DIRTY_LOG: {
3126 struct kvm_clear_dirty_log log;
3127
3128 r = -EFAULT;
3129 if (copy_from_user(&log, argp, sizeof(log)))
3130 goto out;
3131 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
3132 break;
3133 }
3134#endif
3007#ifdef CONFIG_KVM_MMIO 3135#ifdef CONFIG_KVM_MMIO
3008 case KVM_REGISTER_COALESCED_MMIO: { 3136 case KVM_REGISTER_COALESCED_MMIO: {
3009 struct kvm_coalesced_mmio_zone zone; 3137 struct kvm_coalesced_mmio_zone zone;