119 files changed, 19024 insertions, 16329 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index cd209f7730af..356156f5c52d 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
| @@ -305,6 +305,9 @@ the address space for which you want to return the dirty bitmap. | |||
| 305 | They must be less than the value that KVM_CHECK_EXTENSION returns for | 305 | They must be less than the value that KVM_CHECK_EXTENSION returns for |
| 306 | the KVM_CAP_MULTI_ADDRESS_SPACE capability. | 306 | the KVM_CAP_MULTI_ADDRESS_SPACE capability. |
| 307 | 307 | ||
| 308 | The bits in the dirty bitmap are cleared before the ioctl returns, unless | ||
| 309 | KVM_CAP_MANUAL_DIRTY_LOG_PROTECT is enabled. For more information, | ||
| 310 | see the description of the capability. | ||
| 308 | 311 | ||
| 309 | 4.9 KVM_SET_MEMORY_ALIAS | 312 | 4.9 KVM_SET_MEMORY_ALIAS |
| 310 | 313 | ||
| @@ -1129,10 +1132,15 @@ documentation when it pops into existence). | |||
| 1129 | 1132 | ||
| 1130 | 4.37 KVM_ENABLE_CAP | 1133 | 4.37 KVM_ENABLE_CAP |
| 1131 | 1134 | ||
| 1132 | Capability: KVM_CAP_ENABLE_CAP, KVM_CAP_ENABLE_CAP_VM | 1135 | Capability: KVM_CAP_ENABLE_CAP |
| 1133 | Architectures: x86 (only KVM_CAP_ENABLE_CAP_VM), | 1136 | Architectures: mips, ppc, s390 |
| 1134 | mips (only KVM_CAP_ENABLE_CAP), ppc, s390 | 1137 | Type: vcpu ioctl |
| 1135 | Type: vcpu ioctl, vm ioctl (with KVM_CAP_ENABLE_CAP_VM) | 1138 | Parameters: struct kvm_enable_cap (in) |
| 1139 | Returns: 0 on success; -1 on error | ||
| 1140 | |||
| 1141 | Capability: KVM_CAP_ENABLE_CAP_VM | ||
| 1142 | Architectures: all | ||
| 1143 | Type: vm ioctl | ||
| 1136 | Parameters: struct kvm_enable_cap (in) | 1144 | Parameters: struct kvm_enable_cap (in) |
| 1137 | Returns: 0 on success; -1 on error | 1145 | Returns: 0 on success; -1 on error |
| 1138 | 1146 | ||
| @@ -3753,6 +3761,102 @@ Coalesced pio is based on coalesced mmio. There is little difference | |||
| 3753 | between coalesced mmio and pio except that coalesced pio records accesses | 3761 | between coalesced mmio and pio except that coalesced pio records accesses |
| 3754 | to I/O ports. | 3762 | to I/O ports. |
| 3755 | 3763 | ||
| 3764 | 4.117 KVM_CLEAR_DIRTY_LOG (vm ioctl) | ||
| 3765 | |||
| 3766 | Capability: KVM_CAP_MANUAL_DIRTY_LOG_PROTECT | ||
| 3767 | Architectures: x86 | ||
| 3768 | Type: vm ioctl | ||
| 3769 | Parameters: struct kvm_dirty_log (in) | ||
| 3770 | Returns: 0 on success, -1 on error | ||
| 3771 | |||
| 3772 | /* for KVM_CLEAR_DIRTY_LOG */ | ||
| 3773 | struct kvm_clear_dirty_log { | ||
| 3774 | __u32 slot; | ||
| 3775 | __u32 num_pages; | ||
| 3776 | __u64 first_page; | ||
| 3777 | union { | ||
| 3778 | void __user *dirty_bitmap; /* one bit per page */ | ||
| 3779 | __u64 padding; | ||
| 3780 | }; | ||
| 3781 | }; | ||
| 3782 | |||
| 3783 | The ioctl clears the dirty status of pages in a memory slot, according to | ||
| 3784 | the bitmap that is passed in struct kvm_clear_dirty_log's dirty_bitmap | ||
| 3785 | field. Bit 0 of the bitmap corresponds to page "first_page" in the | ||
| 3786 | memory slot, and num_pages is the size in bits of the input bitmap. | ||
| 3787 | Both first_page and num_pages must be a multiple of 64. For each bit | ||
| 3788 | that is set in the input bitmap, the corresponding page is marked "clean" | ||
| 3789 | in KVM's dirty bitmap, and dirty tracking is re-enabled for that page | ||
| 3790 | (for example via write-protection, or by clearing the dirty bit in | ||
| 3791 | a page table entry). | ||
| 3792 | |||
| 3793 | If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 specify | ||
| 3794 | the address space for which you want to clear the dirty status. | ||
| 3795 | They must be less than the value that KVM_CHECK_EXTENSION returns for | ||
| 3796 | the KVM_CAP_MULTI_ADDRESS_SPACE capability. | ||
| 3797 | |||
| 3798 | This ioctl is mostly useful when KVM_CAP_MANUAL_DIRTY_LOG_PROTECT | ||
| 3799 | is enabled; for more information, see the description of the capability. | ||
| 3800 | However, it can always be used as long as KVM_CHECK_EXTENSION confirms | ||
| 3801 | that KVM_CAP_MANUAL_DIRTY_LOG_PROTECT is present. | ||
| 3802 | |||
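As an illustration of the flow just described (not part of the patch itself), here is a minimal userspace sketch of one KVM_CLEAR_DIRTY_LOG call. It assumes vm_fd is a KVM VM file descriptor and dirty_bitmap points to a bitmap previously filled in by KVM_GET_DIRTY_LOG; the helper name is hypothetical.

  #include <string.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Re-protect the first 64 pages of a memslot whose bitmap bits are set. */
  static int clear_first_block(int vm_fd, __u32 slot, void *dirty_bitmap)
  {
          struct kvm_clear_dirty_log clr;

          memset(&clr, 0, sizeof(clr));
          clr.slot = slot;                 /* bits 16-31 select the address space */
          clr.first_page = 0;              /* must be a multiple of 64 */
          clr.num_pages = 64;              /* size of the bitmap, in bits */
          clr.dirty_bitmap = dirty_bitmap; /* set bits are marked clean again */

          return ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clr);
  }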
| 3803 | 4.118 KVM_GET_SUPPORTED_HV_CPUID | ||
| 3804 | |||
| 3805 | Capability: KVM_CAP_HYPERV_CPUID | ||
| 3806 | Architectures: x86 | ||
| 3807 | Type: vcpu ioctl | ||
| 3808 | Parameters: struct kvm_cpuid2 (in/out) | ||
| 3809 | Returns: 0 on success, -1 on error | ||
| 3810 | |||
| 3811 | struct kvm_cpuid2 { | ||
| 3812 | __u32 nent; | ||
| 3813 | __u32 padding; | ||
| 3814 | struct kvm_cpuid_entry2 entries[0]; | ||
| 3815 | }; | ||
| 3816 | |||
| 3817 | struct kvm_cpuid_entry2 { | ||
| 3818 | __u32 function; | ||
| 3819 | __u32 index; | ||
| 3820 | __u32 flags; | ||
| 3821 | __u32 eax; | ||
| 3822 | __u32 ebx; | ||
| 3823 | __u32 ecx; | ||
| 3824 | __u32 edx; | ||
| 3825 | __u32 padding[3]; | ||
| 3826 | }; | ||
| 3827 | |||
| 3828 | This ioctl returns x86 cpuid feature leaves related to Hyper-V emulation in | ||
| 3829 | KVM. Userspace can use the information returned by this ioctl to construct | ||
| 3830 | cpuid information presented to guests consuming Hyper-V enlightenments (e.g. | ||
| 3831 | Windows or Hyper-V guests). | ||
| 3832 | |||
| 3833 | CPUID feature leaves returned by this ioctl are defined by the Hyper-V Top Level | ||
| 3834 | Functional Specification (TLFS). These leaves can't be obtained with the | ||
| 3835 | KVM_GET_SUPPORTED_CPUID ioctl because some of them intersect with KVM feature | ||
| 3836 | leaves (0x40000000, 0x40000001). | ||
| 3837 | |||
| 3838 | Currently, the following CPUID leaves are returned: | ||
| 3839 | HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS | ||
| 3840 | HYPERV_CPUID_INTERFACE | ||
| 3841 | HYPERV_CPUID_VERSION | ||
| 3842 | HYPERV_CPUID_FEATURES | ||
| 3843 | HYPERV_CPUID_ENLIGHTMENT_INFO | ||
| 3844 | HYPERV_CPUID_IMPLEMENT_LIMITS | ||
| 3845 | HYPERV_CPUID_NESTED_FEATURES | ||
| 3846 | |||
| 3847 | HYPERV_CPUID_NESTED_FEATURES leaf is only exposed when Enlightened VMCS was | ||
| 3848 | enabled on the corresponding vCPU (KVM_CAP_HYPERV_ENLIGHTENED_VMCS). | ||
| 3849 | |||
| 3850 | Userspace invokes KVM_GET_SUPPORTED_HV_CPUID by passing a kvm_cpuid2 structure | ||
| 3851 | with the 'nent' field indicating the number of entries in the variable-size | ||
| 3852 | array 'entries'. If the number of entries is too low to describe all Hyper-V | ||
| 3853 | feature leaves, an error (E2BIG) is returned. If the number is greater than or equal | ||
| 3854 | to the number of Hyper-V feature leaves, the 'nent' field is adjusted to the | ||
| 3855 | number of valid entries in the 'entries' array, which is then filled. | ||
| 3856 | |||
| 3857 | The 'index' and 'flags' fields in 'struct kvm_cpuid_entry2' are currently reserved; | ||
| 3858 | userspace should not expect to get any particular value there. | ||
| 3859 | |||
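As an illustration of the 'nent' handshake described above (not part of the patch itself), the sketch below retries with a larger buffer whenever the ioctl fails with E2BIG. The helper name and the initial guess of 8 entries are arbitrary.

  #include <errno.h>
  #include <stdlib.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Fetch the Hyper-V CPUID leaves for a vCPU, growing the buffer as needed. */
  static struct kvm_cpuid2 *get_hv_cpuid(int vcpu_fd)
  {
          __u32 nent = 8;                      /* arbitrary first guess */
          struct kvm_cpuid2 *cpuid;

          for (;;) {
                  cpuid = calloc(1, sizeof(*cpuid) +
                                    nent * sizeof(struct kvm_cpuid_entry2));
                  if (!cpuid)
                          return NULL;
                  cpuid->nent = nent;
                  if (ioctl(vcpu_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid) == 0)
                          return cpuid;        /* nent now holds the valid count */
                  free(cpuid);
                  if (errno != E2BIG)
                          return NULL;         /* some other failure */
                  nent *= 2;                   /* too small: retry with more room */
          }
  }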
| 3756 | 5. The kvm_run structure | 3860 | 5. The kvm_run structure |
| 3757 | ------------------------ | 3861 | ------------------------ |
| 3758 | 3862 | ||
| @@ -4647,6 +4751,30 @@ and injected exceptions. | |||
| 4647 | * For the new DR6 bits, note that bit 16 is set iff the #DB exception | 4751 | * For the new DR6 bits, note that bit 16 is set iff the #DB exception |
| 4648 | will clear DR6.RTM. | 4752 | will clear DR6.RTM. |
| 4649 | 4753 | ||
| 4754 | 7.18 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT | ||
| 4755 | |||
| 4756 | Architectures: all | ||
| 4757 | Parameters: args[0] whether feature should be enabled or not | ||
| 4758 | |||
| 4759 | With this capability enabled, KVM_GET_DIRTY_LOG will not automatically | ||
| 4760 | clear and write-protect all pages that are returned as dirty. | ||
| 4761 | Rather, userspace will have to do this operation separately using | ||
| 4762 | KVM_CLEAR_DIRTY_LOG. | ||
| 4763 | |||
| 4764 | At the cost of a slightly more complicated operation, this provides better | ||
| 4765 | scalability and responsiveness for two reasons. First, | ||
| 4766 | the KVM_CLEAR_DIRTY_LOG ioctl can operate at 64-page granularity rather | ||
| 4767 | than requiring a full memslot to be synced; this ensures that KVM does not | ||
| 4768 | take spinlocks for an extended period of time. Second, in some cases a | ||
| 4769 | large amount of time can pass between a call to KVM_GET_DIRTY_LOG and | ||
| 4770 | userspace actually using the data in the page. Pages can be modified | ||
| 4771 | during this time, which is inefficient for both the guest and userspace: | ||
| 4772 | the guest will incur a higher penalty due to write protection faults, | ||
| 4773 | while userspace can see false reports of dirty pages. Manual reprotection | ||
| 4774 | helps reduce this time, improving guest performance and reducing the | ||
| 4775 | number of dirty log false positives. | ||
| 4776 | |||
| 4777 | |||
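Purely as an illustration of the enable step (not part of the patch itself), the sketch below opts a VM in to manual reprotection. It assumes vm_fd is a KVM VM file descriptor and that KVM_CHECK_EXTENSION has already confirmed KVM_CAP_MANUAL_DIRTY_LOG_PROTECT; the helper name is hypothetical.

  #include <string.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Ask KVM to stop clearing dirty bits automatically in KVM_GET_DIRTY_LOG. */
  static int enable_manual_protect(int vm_fd)
  {
          struct kvm_enable_cap cap;

          memset(&cap, 0, sizeof(cap));
          cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT;
          cap.args[0] = 1;        /* non-zero enables the behaviour */

          /* vm ioctl; relies on KVM_CAP_ENABLE_CAP_VM being present */
          return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
  }

After this, the usual loop becomes KVM_GET_DIRTY_LOG to read the bitmap, followed by KVM_CLEAR_DIRTY_LOG on the 64-page blocks that userspace has finished processing.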
| 4650 | 8. Other capabilities. | 4778 | 8. Other capabilities. |
| 4651 | ---------------------- | 4779 | ---------------------- |
| 4652 | 4780 | ||
diff --git a/MAINTAINERS b/MAINTAINERS
index 80b377dda900..c4665d49dc50 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
| @@ -8309,6 +8309,7 @@ W: http://www.linux-kvm.org | |||
| 8309 | T: git git://git.kernel.org/pub/scm/virt/kvm/kvm.git | 8309 | T: git git://git.kernel.org/pub/scm/virt/kvm/kvm.git |
| 8310 | S: Supported | 8310 | S: Supported |
| 8311 | F: arch/x86/kvm/ | 8311 | F: arch/x86/kvm/ |
| 8312 | F: arch/x86/kvm/*/ | ||
| 8312 | F: arch/x86/include/uapi/asm/kvm* | 8313 | F: arch/x86/include/uapi/asm/kvm* |
| 8313 | F: arch/x86/include/asm/kvm* | 8314 | F: arch/x86/include/asm/kvm* |
| 8314 | F: arch/x86/include/asm/pvclock-abi.h | 8315 | F: arch/x86/include/asm/pvclock-abi.h |
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
index 231e87ad45d5..35491af87985 100644
--- a/arch/arm/include/asm/kvm_asm.h
+++ b/arch/arm/include/asm/kvm_asm.h
| @@ -23,6 +23,10 @@ | |||
| 23 | 23 | ||
| 24 | #define ARM_EXIT_WITH_ABORT_BIT 31 | 24 | #define ARM_EXIT_WITH_ABORT_BIT 31 |
| 25 | #define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_ABORT_BIT)) | 25 | #define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_ABORT_BIT)) |
| 26 | #define ARM_EXCEPTION_IS_TRAP(x) \ | ||
| 27 | (ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_PREF_ABORT || \ | ||
| 28 | ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_DATA_ABORT || \ | ||
| 29 | ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_HVC) | ||
| 26 | #define ARM_ABORT_PENDING(x) !!((x) & (1U << ARM_EXIT_WITH_ABORT_BIT)) | 30 | #define ARM_ABORT_PENDING(x) !!((x) & (1U << ARM_EXIT_WITH_ABORT_BIT)) |
| 27 | 31 | ||
| 28 | #define ARM_EXCEPTION_RESET 0 | 32 | #define ARM_EXCEPTION_RESET 0 |
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 2184d9ddb418..ca56537b61bc 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
| @@ -225,7 +225,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, | |||
| 225 | #define KVM_ARCH_WANT_MMU_NOTIFIER | 225 | #define KVM_ARCH_WANT_MMU_NOTIFIER |
| 226 | int kvm_unmap_hva_range(struct kvm *kvm, | 226 | int kvm_unmap_hva_range(struct kvm *kvm, |
| 227 | unsigned long start, unsigned long end); | 227 | unsigned long start, unsigned long end); |
| 228 | void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); | 228 | int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); |
| 229 | 229 | ||
| 230 | unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu); | 230 | unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu); |
| 231 | int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); | 231 | int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); |
| @@ -296,11 +296,6 @@ static inline void kvm_arm_init_debug(void) {} | |||
| 296 | static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {} | 296 | static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {} |
| 297 | static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {} | 297 | static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {} |
| 298 | static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {} | 298 | static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {} |
| 299 | static inline bool kvm_arm_handle_step_debug(struct kvm_vcpu *vcpu, | ||
| 300 | struct kvm_run *run) | ||
| 301 | { | ||
| 302 | return false; | ||
| 303 | } | ||
| 304 | 299 | ||
| 305 | int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, | 300 | int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, |
| 306 | struct kvm_device_attr *attr); | 301 | struct kvm_device_attr *attr); |
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 1098ffc3d54b..3a875fc1b63c 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
| @@ -82,6 +82,67 @@ void kvm_clear_hyp_idmap(void); | |||
| 82 | #define kvm_mk_pud(pmdp) __pud(__pa(pmdp) | PMD_TYPE_TABLE) | 82 | #define kvm_mk_pud(pmdp) __pud(__pa(pmdp) | PMD_TYPE_TABLE) |
| 83 | #define kvm_mk_pgd(pudp) ({ BUILD_BUG(); 0; }) | 83 | #define kvm_mk_pgd(pudp) ({ BUILD_BUG(); 0; }) |
| 84 | 84 | ||
| 85 | #define kvm_pfn_pte(pfn, prot) pfn_pte(pfn, prot) | ||
| 86 | #define kvm_pfn_pmd(pfn, prot) pfn_pmd(pfn, prot) | ||
| 87 | #define kvm_pfn_pud(pfn, prot) (__pud(0)) | ||
| 88 | |||
| 89 | #define kvm_pud_pfn(pud) ({ WARN_ON(1); 0; }) | ||
| 90 | |||
| 91 | |||
| 92 | #define kvm_pmd_mkhuge(pmd) pmd_mkhuge(pmd) | ||
| 93 | /* No support for pud hugepages */ | ||
| 94 | #define kvm_pud_mkhuge(pud) ({ WARN_ON(1); pud; }) | ||
| 95 | |||
| 96 | /* | ||
| 97 | * The following kvm_*pud*() functions are provided strictly to allow | ||
| 98 | * sharing code with arm64. They should never be called in practice. | ||
| 99 | */ | ||
| 100 | static inline void kvm_set_s2pud_readonly(pud_t *pud) | ||
| 101 | { | ||
| 102 | WARN_ON(1); | ||
| 103 | } | ||
| 104 | |||
| 105 | static inline bool kvm_s2pud_readonly(pud_t *pud) | ||
| 106 | { | ||
| 107 | WARN_ON(1); | ||
| 108 | return false; | ||
| 109 | } | ||
| 110 | |||
| 111 | static inline void kvm_set_pud(pud_t *pud, pud_t new_pud) | ||
| 112 | { | ||
| 113 | WARN_ON(1); | ||
| 114 | } | ||
| 115 | |||
| 116 | static inline pud_t kvm_s2pud_mkwrite(pud_t pud) | ||
| 117 | { | ||
| 118 | WARN_ON(1); | ||
| 119 | return pud; | ||
| 120 | } | ||
| 121 | |||
| 122 | static inline pud_t kvm_s2pud_mkexec(pud_t pud) | ||
| 123 | { | ||
| 124 | WARN_ON(1); | ||
| 125 | return pud; | ||
| 126 | } | ||
| 127 | |||
| 128 | static inline bool kvm_s2pud_exec(pud_t *pud) | ||
| 129 | { | ||
| 130 | WARN_ON(1); | ||
| 131 | return false; | ||
| 132 | } | ||
| 133 | |||
| 134 | static inline pud_t kvm_s2pud_mkyoung(pud_t pud) | ||
| 135 | { | ||
| 136 | BUG(); | ||
| 137 | return pud; | ||
| 138 | } | ||
| 139 | |||
| 140 | static inline bool kvm_s2pud_young(pud_t pud) | ||
| 141 | { | ||
| 142 | WARN_ON(1); | ||
| 143 | return false; | ||
| 144 | } | ||
| 145 | |||
| 85 | static inline pte_t kvm_s2pte_mkwrite(pte_t pte) | 146 | static inline pte_t kvm_s2pte_mkwrite(pte_t pte) |
| 86 | { | 147 | { |
| 87 | pte_val(pte) |= L_PTE_S2_RDWR; | 148 | pte_val(pte) |= L_PTE_S2_RDWR; |
diff --git a/arch/arm/include/asm/stage2_pgtable.h b/arch/arm/include/asm/stage2_pgtable.h
index f6a7ea805232..c4b1d4fb1797 100644
--- a/arch/arm/include/asm/stage2_pgtable.h
+++ b/arch/arm/include/asm/stage2_pgtable.h
| @@ -68,4 +68,12 @@ stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) | |||
| 68 | #define stage2_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp) | 68 | #define stage2_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp) |
| 69 | #define stage2_pud_table_empty(kvm, pudp) false | 69 | #define stage2_pud_table_empty(kvm, pudp) false |
| 70 | 70 | ||
| 71 | static inline bool kvm_stage2_has_pud(struct kvm *kvm) | ||
| 72 | { | ||
| 73 | return false; | ||
| 74 | } | ||
| 75 | |||
| 76 | #define S2_PMD_MASK PMD_MASK | ||
| 77 | #define S2_PMD_SIZE PMD_SIZE | ||
| 78 | |||
| 71 | #endif /* __ARM_S2_PGTABLE_H_ */ | 79 | #endif /* __ARM_S2_PGTABLE_H_ */ |
diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c
index cb094e55dc5f..222c1635bc7a 100644
--- a/arch/arm/kvm/coproc.c
+++ b/arch/arm/kvm/coproc.c
| @@ -602,8 +602,8 @@ static int emulate_cp15(struct kvm_vcpu *vcpu, | |||
| 602 | } | 602 | } |
| 603 | } else { | 603 | } else { |
| 604 | /* If access function fails, it should complain. */ | 604 | /* If access function fails, it should complain. */ |
| 605 | kvm_err("Unsupported guest CP15 access at: %08lx\n", | 605 | kvm_err("Unsupported guest CP15 access at: %08lx [%08lx]\n", |
| 606 | *vcpu_pc(vcpu)); | 606 | *vcpu_pc(vcpu), *vcpu_cpsr(vcpu)); |
| 607 | print_cp_instr(params); | 607 | print_cp_instr(params); |
| 608 | kvm_inject_undefined(vcpu); | 608 | kvm_inject_undefined(vcpu); |
| 609 | } | 609 | } |
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index f9123fe8fcf3..7f9d2bfcf82e 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
| @@ -107,7 +107,7 @@ | |||
| 107 | TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK | TCR_EL2_T0SZ_MASK) | 107 | TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK | TCR_EL2_T0SZ_MASK) |
| 108 | 108 | ||
| 109 | /* VTCR_EL2 Registers bits */ | 109 | /* VTCR_EL2 Registers bits */ |
| 110 | #define VTCR_EL2_RES1 (1 << 31) | 110 | #define VTCR_EL2_RES1 (1U << 31) |
| 111 | #define VTCR_EL2_HD (1 << 22) | 111 | #define VTCR_EL2_HD (1 << 22) |
| 112 | #define VTCR_EL2_HA (1 << 21) | 112 | #define VTCR_EL2_HA (1 << 21) |
| 113 | #define VTCR_EL2_PS_SHIFT TCR_EL2_PS_SHIFT | 113 | #define VTCR_EL2_PS_SHIFT TCR_EL2_PS_SHIFT |
| @@ -323,10 +323,6 @@ | |||
| 323 | #define PAR_TO_HPFAR(par) \ | 323 | #define PAR_TO_HPFAR(par) \ |
| 324 | (((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8) | 324 | (((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8) |
| 325 | 325 | ||
| 326 | #define kvm_arm_exception_type \ | ||
| 327 | {0, "IRQ" }, \ | ||
| 328 | {1, "TRAP" } | ||
| 329 | |||
| 330 | #define ECN(x) { ESR_ELx_EC_##x, #x } | 326 | #define ECN(x) { ESR_ELx_EC_##x, #x } |
| 331 | 327 | ||
| 332 | #define kvm_arm_exception_class \ | 328 | #define kvm_arm_exception_class \ |
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index aea01a09eb94..f5b79e995f40 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
| @@ -25,6 +25,7 @@ | |||
| 25 | 25 | ||
| 26 | #define ARM_EXIT_WITH_SERROR_BIT 31 | 26 | #define ARM_EXIT_WITH_SERROR_BIT 31 |
| 27 | #define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_SERROR_BIT)) | 27 | #define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_SERROR_BIT)) |
| 28 | #define ARM_EXCEPTION_IS_TRAP(x) (ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_TRAP) | ||
| 28 | #define ARM_SERROR_PENDING(x) !!((x) & (1U << ARM_EXIT_WITH_SERROR_BIT)) | 29 | #define ARM_SERROR_PENDING(x) !!((x) & (1U << ARM_EXIT_WITH_SERROR_BIT)) |
| 29 | 30 | ||
| 30 | #define ARM_EXCEPTION_IRQ 0 | 31 | #define ARM_EXCEPTION_IRQ 0 |
| @@ -34,6 +35,12 @@ | |||
| 34 | /* The hyp-stub will return this for any kvm_call_hyp() call */ | 35 | /* The hyp-stub will return this for any kvm_call_hyp() call */ |
| 35 | #define ARM_EXCEPTION_HYP_GONE HVC_STUB_ERR | 36 | #define ARM_EXCEPTION_HYP_GONE HVC_STUB_ERR |
| 36 | 37 | ||
| 38 | #define kvm_arm_exception_type \ | ||
| 39 | {ARM_EXCEPTION_IRQ, "IRQ" }, \ | ||
| 40 | {ARM_EXCEPTION_EL1_SERROR, "SERROR" }, \ | ||
| 41 | {ARM_EXCEPTION_TRAP, "TRAP" }, \ | ||
| 42 | {ARM_EXCEPTION_HYP_GONE, "HYP_GONE" } | ||
| 43 | |||
| 37 | #ifndef __ASSEMBLY__ | 44 | #ifndef __ASSEMBLY__ |
| 38 | 45 | ||
| 39 | #include <linux/mm.h> | 46 | #include <linux/mm.h> |
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 21247870def7..506386a3edde 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
| @@ -24,6 +24,7 @@ | |||
| 24 | 24 | ||
| 25 | #include <linux/kvm_host.h> | 25 | #include <linux/kvm_host.h> |
| 26 | 26 | ||
| 27 | #include <asm/debug-monitors.h> | ||
| 27 | #include <asm/esr.h> | 28 | #include <asm/esr.h> |
| 28 | #include <asm/kvm_arm.h> | 29 | #include <asm/kvm_arm.h> |
| 29 | #include <asm/kvm_hyp.h> | 30 | #include <asm/kvm_hyp.h> |
| @@ -147,14 +148,6 @@ static inline bool kvm_condition_valid(const struct kvm_vcpu *vcpu) | |||
| 147 | return true; | 148 | return true; |
| 148 | } | 149 | } |
| 149 | 150 | ||
| 150 | static inline void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr) | ||
| 151 | { | ||
| 152 | if (vcpu_mode_is_32bit(vcpu)) | ||
| 153 | kvm_skip_instr32(vcpu, is_wide_instr); | ||
| 154 | else | ||
| 155 | *vcpu_pc(vcpu) += 4; | ||
| 156 | } | ||
| 157 | |||
| 158 | static inline void vcpu_set_thumb(struct kvm_vcpu *vcpu) | 151 | static inline void vcpu_set_thumb(struct kvm_vcpu *vcpu) |
| 159 | { | 152 | { |
| 160 | *vcpu_cpsr(vcpu) |= PSR_AA32_T_BIT; | 153 | *vcpu_cpsr(vcpu) |= PSR_AA32_T_BIT; |
| @@ -424,4 +417,30 @@ static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu, | |||
| 424 | return data; /* Leave LE untouched */ | 417 | return data; /* Leave LE untouched */ |
| 425 | } | 418 | } |
| 426 | 419 | ||
| 420 | static inline void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr) | ||
| 421 | { | ||
| 422 | if (vcpu_mode_is_32bit(vcpu)) | ||
| 423 | kvm_skip_instr32(vcpu, is_wide_instr); | ||
| 424 | else | ||
| 425 | *vcpu_pc(vcpu) += 4; | ||
| 426 | |||
| 427 | /* advance the singlestep state machine */ | ||
| 428 | *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; | ||
| 429 | } | ||
| 430 | |||
| 431 | /* | ||
| 432 | * Skip an instruction which has been emulated at hyp while most guest sysregs | ||
| 433 | * are live. | ||
| 434 | */ | ||
| 435 | static inline void __hyp_text __kvm_skip_instr(struct kvm_vcpu *vcpu) | ||
| 436 | { | ||
| 437 | *vcpu_pc(vcpu) = read_sysreg_el2(elr); | ||
| 438 | vcpu->arch.ctxt.gp_regs.regs.pstate = read_sysreg_el2(spsr); | ||
| 439 | |||
| 440 | kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); | ||
| 441 | |||
| 442 | write_sysreg_el2(vcpu->arch.ctxt.gp_regs.regs.pstate, spsr); | ||
| 443 | write_sysreg_el2(*vcpu_pc(vcpu), elr); | ||
| 444 | } | ||
| 445 | |||
| 427 | #endif /* __ARM64_KVM_EMULATE_H__ */ | 446 | #endif /* __ARM64_KVM_EMULATE_H__ */ |
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 9217759afa6b..7732d0ba4e60 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
| @@ -319,7 +319,7 @@ struct kvm_vcpu_arch { | |||
| 319 | */ | 319 | */ |
| 320 | #define __vcpu_sys_reg(v,r) ((v)->arch.ctxt.sys_regs[(r)]) | 320 | #define __vcpu_sys_reg(v,r) ((v)->arch.ctxt.sys_regs[(r)]) |
| 321 | 321 | ||
| 322 | u64 vcpu_read_sys_reg(struct kvm_vcpu *vcpu, int reg); | 322 | u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg); |
| 323 | void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg); | 323 | void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg); |
| 324 | 324 | ||
| 325 | /* | 325 | /* |
| @@ -360,7 +360,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, | |||
| 360 | #define KVM_ARCH_WANT_MMU_NOTIFIER | 360 | #define KVM_ARCH_WANT_MMU_NOTIFIER |
| 361 | int kvm_unmap_hva_range(struct kvm *kvm, | 361 | int kvm_unmap_hva_range(struct kvm *kvm, |
| 362 | unsigned long start, unsigned long end); | 362 | unsigned long start, unsigned long end); |
| 363 | void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); | 363 | int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); |
| 364 | int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); | 364 | int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); |
| 365 | int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); | 365 | int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); |
| 366 | 366 | ||
| @@ -449,7 +449,6 @@ void kvm_arm_init_debug(void); | |||
| 449 | void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); | 449 | void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); |
| 450 | void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); | 450 | void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); |
| 451 | void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu); | 451 | void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu); |
| 452 | bool kvm_arm_handle_step_debug(struct kvm_vcpu *vcpu, struct kvm_run *run); | ||
| 453 | int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, | 452 | int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, |
| 454 | struct kvm_device_attr *attr); | 453 | struct kvm_device_attr *attr); |
| 455 | int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, | 454 | int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, |
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 658657367f2f..8af4b1befa42 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
| @@ -184,6 +184,17 @@ void kvm_clear_hyp_idmap(void); | |||
| 184 | #define kvm_mk_pgd(pudp) \ | 184 | #define kvm_mk_pgd(pudp) \ |
| 185 | __pgd(__phys_to_pgd_val(__pa(pudp)) | PUD_TYPE_TABLE) | 185 | __pgd(__phys_to_pgd_val(__pa(pudp)) | PUD_TYPE_TABLE) |
| 186 | 186 | ||
| 187 | #define kvm_set_pud(pudp, pud) set_pud(pudp, pud) | ||
| 188 | |||
| 189 | #define kvm_pfn_pte(pfn, prot) pfn_pte(pfn, prot) | ||
| 190 | #define kvm_pfn_pmd(pfn, prot) pfn_pmd(pfn, prot) | ||
| 191 | #define kvm_pfn_pud(pfn, prot) pfn_pud(pfn, prot) | ||
| 192 | |||
| 193 | #define kvm_pud_pfn(pud) pud_pfn(pud) | ||
| 194 | |||
| 195 | #define kvm_pmd_mkhuge(pmd) pmd_mkhuge(pmd) | ||
| 196 | #define kvm_pud_mkhuge(pud) pud_mkhuge(pud) | ||
| 197 | |||
| 187 | static inline pte_t kvm_s2pte_mkwrite(pte_t pte) | 198 | static inline pte_t kvm_s2pte_mkwrite(pte_t pte) |
| 188 | { | 199 | { |
| 189 | pte_val(pte) |= PTE_S2_RDWR; | 200 | pte_val(pte) |= PTE_S2_RDWR; |
| @@ -196,6 +207,12 @@ static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd) | |||
| 196 | return pmd; | 207 | return pmd; |
| 197 | } | 208 | } |
| 198 | 209 | ||
| 210 | static inline pud_t kvm_s2pud_mkwrite(pud_t pud) | ||
| 211 | { | ||
| 212 | pud_val(pud) |= PUD_S2_RDWR; | ||
| 213 | return pud; | ||
| 214 | } | ||
| 215 | |||
| 199 | static inline pte_t kvm_s2pte_mkexec(pte_t pte) | 216 | static inline pte_t kvm_s2pte_mkexec(pte_t pte) |
| 200 | { | 217 | { |
| 201 | pte_val(pte) &= ~PTE_S2_XN; | 218 | pte_val(pte) &= ~PTE_S2_XN; |
| @@ -208,6 +225,12 @@ static inline pmd_t kvm_s2pmd_mkexec(pmd_t pmd) | |||
| 208 | return pmd; | 225 | return pmd; |
| 209 | } | 226 | } |
| 210 | 227 | ||
| 228 | static inline pud_t kvm_s2pud_mkexec(pud_t pud) | ||
| 229 | { | ||
| 230 | pud_val(pud) &= ~PUD_S2_XN; | ||
| 231 | return pud; | ||
| 232 | } | ||
| 233 | |||
| 211 | static inline void kvm_set_s2pte_readonly(pte_t *ptep) | 234 | static inline void kvm_set_s2pte_readonly(pte_t *ptep) |
| 212 | { | 235 | { |
| 213 | pteval_t old_pteval, pteval; | 236 | pteval_t old_pteval, pteval; |
| @@ -246,6 +269,31 @@ static inline bool kvm_s2pmd_exec(pmd_t *pmdp) | |||
| 246 | return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN); | 269 | return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN); |
| 247 | } | 270 | } |
| 248 | 271 | ||
| 272 | static inline void kvm_set_s2pud_readonly(pud_t *pudp) | ||
| 273 | { | ||
| 274 | kvm_set_s2pte_readonly((pte_t *)pudp); | ||
| 275 | } | ||
| 276 | |||
| 277 | static inline bool kvm_s2pud_readonly(pud_t *pudp) | ||
| 278 | { | ||
| 279 | return kvm_s2pte_readonly((pte_t *)pudp); | ||
| 280 | } | ||
| 281 | |||
| 282 | static inline bool kvm_s2pud_exec(pud_t *pudp) | ||
| 283 | { | ||
| 284 | return !(READ_ONCE(pud_val(*pudp)) & PUD_S2_XN); | ||
| 285 | } | ||
| 286 | |||
| 287 | static inline pud_t kvm_s2pud_mkyoung(pud_t pud) | ||
| 288 | { | ||
| 289 | return pud_mkyoung(pud); | ||
| 290 | } | ||
| 291 | |||
| 292 | static inline bool kvm_s2pud_young(pud_t pud) | ||
| 293 | { | ||
| 294 | return pud_young(pud); | ||
| 295 | } | ||
| 296 | |||
| 249 | #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) | 297 | #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) |
| 250 | 298 | ||
| 251 | #ifdef __PAGETABLE_PMD_FOLDED | 299 | #ifdef __PAGETABLE_PMD_FOLDED |
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index 54a37660b8c9..22bb3ae514f5 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
| @@ -193,6 +193,10 @@ | |||
| 193 | #define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */ | 193 | #define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */ |
| 194 | #define PMD_S2_XN (_AT(pmdval_t, 2) << 53) /* XN[1:0] */ | 194 | #define PMD_S2_XN (_AT(pmdval_t, 2) << 53) /* XN[1:0] */ |
| 195 | 195 | ||
| 196 | #define PUD_S2_RDONLY (_AT(pudval_t, 1) << 6) /* HAP[2:1] */ | ||
| 197 | #define PUD_S2_RDWR (_AT(pudval_t, 3) << 6) /* HAP[2:1] */ | ||
| 198 | #define PUD_S2_XN (_AT(pudval_t, 2) << 53) /* XN[1:0] */ | ||
| 199 | |||
| 196 | /* | 200 | /* |
| 197 | * Memory Attribute override for Stage-2 (MemAttr[3:0]) | 201 | * Memory Attribute override for Stage-2 (MemAttr[3:0]) |
| 198 | */ | 202 | */ |
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 5bbb59c81920..de70c1eabf33 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
| @@ -315,6 +315,11 @@ static inline pte_t pud_pte(pud_t pud) | |||
| 315 | return __pte(pud_val(pud)); | 315 | return __pte(pud_val(pud)); |
| 316 | } | 316 | } |
| 317 | 317 | ||
| 318 | static inline pud_t pte_pud(pte_t pte) | ||
| 319 | { | ||
| 320 | return __pud(pte_val(pte)); | ||
| 321 | } | ||
| 322 | |||
| 318 | static inline pmd_t pud_pmd(pud_t pud) | 323 | static inline pmd_t pud_pmd(pud_t pud) |
| 319 | { | 324 | { |
| 320 | return __pmd(pud_val(pud)); | 325 | return __pmd(pud_val(pud)); |
| @@ -382,8 +387,12 @@ static inline int pmd_protnone(pmd_t pmd) | |||
| 382 | #define pfn_pmd(pfn,prot) __pmd(__phys_to_pmd_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) | 387 | #define pfn_pmd(pfn,prot) __pmd(__phys_to_pmd_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) |
| 383 | #define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) | 388 | #define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) |
| 384 | 389 | ||
| 390 | #define pud_young(pud) pte_young(pud_pte(pud)) | ||
| 391 | #define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud))) | ||
| 385 | #define pud_write(pud) pte_write(pud_pte(pud)) | 392 | #define pud_write(pud) pte_write(pud_pte(pud)) |
| 386 | 393 | ||
| 394 | #define pud_mkhuge(pud) (__pud(pud_val(pud) & ~PUD_TABLE_BIT)) | ||
| 395 | |||
| 387 | #define __pud_to_phys(pud) __pte_to_phys(pud_pte(pud)) | 396 | #define __pud_to_phys(pud) __pte_to_phys(pud_pte(pud)) |
| 388 | #define __phys_to_pud_val(phys) __phys_to_pte_val(phys) | 397 | #define __phys_to_pud_val(phys) __phys_to_pte_val(phys) |
| 389 | #define pud_pfn(pud) ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT) | 398 | #define pud_pfn(pud) ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT) |
diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h
index d352f6df8d2c..5412fa40825e 100644
--- a/arch/arm64/include/asm/stage2_pgtable.h
+++ b/arch/arm64/include/asm/stage2_pgtable.h
| @@ -30,16 +30,14 @@ | |||
| 30 | #define pt_levels_pgdir_shift(lvls) ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - (lvls)) | 30 | #define pt_levels_pgdir_shift(lvls) ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - (lvls)) |
| 31 | 31 | ||
| 32 | /* | 32 | /* |
| 33 | * The hardware supports concatenation of up to 16 tables at stage2 entry level | 33 | * The hardware supports concatenation of up to 16 tables at stage2 entry |
| 34 | * and we use the feature whenever possible. | 34 | * level and we use the feature whenever possible, which means we resolve 4 |
| 35 | * additional bits of address at the entry level. | ||
| 35 | * | 36 | * |
| 36 | * Now, the minimum number of bits resolved at any level is (PAGE_SHIFT - 3). | 37 | * This implies, the total number of page table levels required for |
| 37 | * On arm64, the smallest PAGE_SIZE supported is 4k, which means | 38 | * IPA_SHIFT at stage2 expected by the hardware can be calculated using |
| 38 | * (PAGE_SHIFT - 3) > 4 holds for all page sizes. | 39 | * the same logic used for the (non-collapsable) stage1 page tables but for |
| 39 | * This implies, the total number of page table levels at stage2 expected | 40 | * (IPA_SHIFT - 4). |
| 40 | * by the hardware is actually the number of levels required for (IPA_SHIFT - 4) | ||
| 41 | * in normal translations(e.g, stage1), since we cannot have another level in | ||
| 42 | * the range (IPA_SHIFT, IPA_SHIFT - 4). | ||
| 43 | */ | 41 | */ |
| 44 | #define stage2_pgtable_levels(ipa) ARM64_HW_PGTABLE_LEVELS((ipa) - 4) | 42 | #define stage2_pgtable_levels(ipa) ARM64_HW_PGTABLE_LEVELS((ipa) - 4) |
| 45 | #define kvm_stage2_levels(kvm) VTCR_EL2_LVLS(kvm->arch.vtcr) | 43 | #define kvm_stage2_levels(kvm) VTCR_EL2_LVLS(kvm->arch.vtcr) |
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index 00d422336a45..f39801e4136c 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
| @@ -236,24 +236,3 @@ void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) | |||
| 236 | } | 236 | } |
| 237 | } | 237 | } |
| 238 | } | 238 | } |
| 239 | |||
| 240 | |||
| 241 | /* | ||
| 242 | * After successfully emulating an instruction, we might want to | ||
| 243 | * return to user space with a KVM_EXIT_DEBUG. We can only do this | ||
| 244 | * once the emulation is complete, though, so for userspace emulations | ||
| 245 | * we have to wait until we have re-entered KVM before calling this | ||
| 246 | * helper. | ||
| 247 | * | ||
| 248 | * Return true (and set exit_reason) to return to userspace or false | ||
| 249 | * if no further action is required. | ||
| 250 | */ | ||
| 251 | bool kvm_arm_handle_step_debug(struct kvm_vcpu *vcpu, struct kvm_run *run) | ||
| 252 | { | ||
| 253 | if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { | ||
| 254 | run->exit_reason = KVM_EXIT_DEBUG; | ||
| 255 | run->debug.arch.hsr = ESR_ELx_EC_SOFTSTP_LOW << ESR_ELx_EC_SHIFT; | ||
| 256 | return true; | ||
| 257 | } | ||
| 258 | return false; | ||
| 259 | } | ||
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index ab35929dcb3c..0b7983442071 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
| @@ -247,13 +247,6 @@ static int handle_trap_exceptions(struct kvm_vcpu *vcpu, struct kvm_run *run) | |||
| 247 | handled = exit_handler(vcpu, run); | 247 | handled = exit_handler(vcpu, run); |
| 248 | } | 248 | } |
| 249 | 249 | ||
| 250 | /* | ||
| 251 | * kvm_arm_handle_step_debug() sets the exit_reason on the kvm_run | ||
| 252 | * structure if we need to return to userspace. | ||
| 253 | */ | ||
| 254 | if (handled > 0 && kvm_arm_handle_step_debug(vcpu, run)) | ||
| 255 | handled = 0; | ||
| 256 | |||
| 257 | return handled; | 250 | return handled; |
| 258 | } | 251 | } |
| 259 | 252 | ||
| @@ -287,12 +280,7 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, | |||
| 287 | case ARM_EXCEPTION_IRQ: | 280 | case ARM_EXCEPTION_IRQ: |
| 288 | return 1; | 281 | return 1; |
| 289 | case ARM_EXCEPTION_EL1_SERROR: | 282 | case ARM_EXCEPTION_EL1_SERROR: |
| 290 | /* We may still need to return for single-step */ | 283 | return 1; |
| 291 | if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS) | ||
| 292 | && kvm_arm_handle_step_debug(vcpu, run)) | ||
| 293 | return 0; | ||
| 294 | else | ||
| 295 | return 1; | ||
| 296 | case ARM_EXCEPTION_TRAP: | 284 | case ARM_EXCEPTION_TRAP: |
| 297 | return handle_trap_exceptions(vcpu, run); | 285 | return handle_trap_exceptions(vcpu, run); |
| 298 | case ARM_EXCEPTION_HYP_GONE: | 286 | case ARM_EXCEPTION_HYP_GONE: |
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index 63ac10ead3a8..b0b1478094b4 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
| @@ -313,33 +313,6 @@ static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu) | |||
| 313 | return true; | 313 | return true; |
| 314 | } | 314 | } |
| 315 | 315 | ||
| 316 | /* Skip an instruction which has been emulated. Returns true if | ||
| 317 | * execution can continue or false if we need to exit hyp mode because | ||
| 318 | * single-step was in effect. | ||
| 319 | */ | ||
| 320 | static bool __hyp_text __skip_instr(struct kvm_vcpu *vcpu) | ||
| 321 | { | ||
| 322 | *vcpu_pc(vcpu) = read_sysreg_el2(elr); | ||
| 323 | |||
| 324 | if (vcpu_mode_is_32bit(vcpu)) { | ||
| 325 | vcpu->arch.ctxt.gp_regs.regs.pstate = read_sysreg_el2(spsr); | ||
| 326 | kvm_skip_instr32(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); | ||
| 327 | write_sysreg_el2(vcpu->arch.ctxt.gp_regs.regs.pstate, spsr); | ||
| 328 | } else { | ||
| 329 | *vcpu_pc(vcpu) += 4; | ||
| 330 | } | ||
| 331 | |||
| 332 | write_sysreg_el2(*vcpu_pc(vcpu), elr); | ||
| 333 | |||
| 334 | if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { | ||
| 335 | vcpu->arch.fault.esr_el2 = | ||
| 336 | (ESR_ELx_EC_SOFTSTP_LOW << ESR_ELx_EC_SHIFT) | 0x22; | ||
| 337 | return false; | ||
| 338 | } else { | ||
| 339 | return true; | ||
| 340 | } | ||
| 341 | } | ||
| 342 | |||
| 343 | static bool __hyp_text __hyp_switch_fpsimd(struct kvm_vcpu *vcpu) | 316 | static bool __hyp_text __hyp_switch_fpsimd(struct kvm_vcpu *vcpu) |
| 344 | { | 317 | { |
| 345 | struct user_fpsimd_state *host_fpsimd = vcpu->arch.host_fpsimd_state; | 318 | struct user_fpsimd_state *host_fpsimd = vcpu->arch.host_fpsimd_state; |
| @@ -428,20 +401,12 @@ static bool __hyp_text fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) | |||
| 428 | if (valid) { | 401 | if (valid) { |
| 429 | int ret = __vgic_v2_perform_cpuif_access(vcpu); | 402 | int ret = __vgic_v2_perform_cpuif_access(vcpu); |
| 430 | 403 | ||
| 431 | if (ret == 1 && __skip_instr(vcpu)) | 404 | if (ret == 1) |
| 432 | return true; | 405 | return true; |
| 433 | 406 | ||
| 434 | if (ret == -1) { | 407 | /* Promote an illegal access to an SError.*/ |
| 435 | /* Promote an illegal access to an | 408 | if (ret == -1) |
| 436 | * SError. If we would be returning | ||
| 437 | * due to single-step clear the SS | ||
| 438 | * bit so handle_exit knows what to | ||
| 439 | * do after dealing with the error. | ||
| 440 | */ | ||
| 441 | if (!__skip_instr(vcpu)) | ||
| 442 | *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; | ||
| 443 | *exit_code = ARM_EXCEPTION_EL1_SERROR; | 409 | *exit_code = ARM_EXCEPTION_EL1_SERROR; |
| 444 | } | ||
| 445 | 410 | ||
| 446 | goto exit; | 411 | goto exit; |
| 447 | } | 412 | } |
| @@ -452,7 +417,7 @@ static bool __hyp_text fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) | |||
| 452 | kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_CP15_32)) { | 417 | kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_CP15_32)) { |
| 453 | int ret = __vgic_v3_perform_cpuif_access(vcpu); | 418 | int ret = __vgic_v3_perform_cpuif_access(vcpu); |
| 454 | 419 | ||
| 455 | if (ret == 1 && __skip_instr(vcpu)) | 420 | if (ret == 1) |
| 456 | return true; | 421 | return true; |
| 457 | } | 422 | } |
| 458 | 423 | ||
diff --git a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
index 215c7c0eb3b0..9cbdd034a563 100644
--- a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
+++ b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
| @@ -41,7 +41,7 @@ static bool __hyp_text __is_be(struct kvm_vcpu *vcpu) | |||
| 41 | * Returns: | 41 | * Returns: |
| 42 | * 1: GICV access successfully performed | 42 | * 1: GICV access successfully performed |
| 43 | * 0: Not a GICV access | 43 | * 0: Not a GICV access |
| 44 | * -1: Illegal GICV access | 44 | * -1: Illegal GICV access successfully performed |
| 45 | */ | 45 | */ |
| 46 | int __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) | 46 | int __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) |
| 47 | { | 47 | { |
| @@ -61,12 +61,16 @@ int __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) | |||
| 61 | return 0; | 61 | return 0; |
| 62 | 62 | ||
| 63 | /* Reject anything but a 32bit access */ | 63 | /* Reject anything but a 32bit access */ |
| 64 | if (kvm_vcpu_dabt_get_as(vcpu) != sizeof(u32)) | 64 | if (kvm_vcpu_dabt_get_as(vcpu) != sizeof(u32)) { |
| 65 | __kvm_skip_instr(vcpu); | ||
| 65 | return -1; | 66 | return -1; |
| 67 | } | ||
| 66 | 68 | ||
| 67 | /* Not aligned? Don't bother */ | 69 | /* Not aligned? Don't bother */ |
| 68 | if (fault_ipa & 3) | 70 | if (fault_ipa & 3) { |
| 71 | __kvm_skip_instr(vcpu); | ||
| 69 | return -1; | 72 | return -1; |
| 73 | } | ||
| 70 | 74 | ||
| 71 | rd = kvm_vcpu_dabt_get_rd(vcpu); | 75 | rd = kvm_vcpu_dabt_get_rd(vcpu); |
| 72 | addr = hyp_symbol_addr(kvm_vgic_global_state)->vcpu_hyp_va; | 76 | addr = hyp_symbol_addr(kvm_vgic_global_state)->vcpu_hyp_va; |
| @@ -88,5 +92,7 @@ int __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) | |||
| 88 | vcpu_set_reg(vcpu, rd, data); | 92 | vcpu_set_reg(vcpu, rd, data); |
| 89 | } | 93 | } |
| 90 | 94 | ||
| 95 | __kvm_skip_instr(vcpu); | ||
| 96 | |||
| 91 | return 1; | 97 | return 1; |
| 92 | } | 98 | } |
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 1ca592d38c3c..e3e37228ae4e 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
| @@ -76,7 +76,7 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu, | |||
| 76 | return false; | 76 | return false; |
| 77 | } | 77 | } |
| 78 | 78 | ||
| 79 | u64 vcpu_read_sys_reg(struct kvm_vcpu *vcpu, int reg) | 79 | u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) |
| 80 | { | 80 | { |
| 81 | if (!vcpu->arch.sysregs_loaded_on_cpu) | 81 | if (!vcpu->arch.sysregs_loaded_on_cpu) |
| 82 | goto immediate_read; | 82 | goto immediate_read; |
| @@ -1858,6 +1858,8 @@ static void perform_access(struct kvm_vcpu *vcpu, | |||
| 1858 | struct sys_reg_params *params, | 1858 | struct sys_reg_params *params, |
| 1859 | const struct sys_reg_desc *r) | 1859 | const struct sys_reg_desc *r) |
| 1860 | { | 1860 | { |
| 1861 | trace_kvm_sys_access(*vcpu_pc(vcpu), params, r); | ||
| 1862 | |||
| 1861 | /* | 1863 | /* |
| 1862 | * Not having an accessor means that we have configured a trap | 1864 | * Not having an accessor means that we have configured a trap |
| 1863 | * that we don't know how to handle. This certainly qualifies | 1865 | * that we don't know how to handle. This certainly qualifies |
| @@ -1920,8 +1922,8 @@ static void unhandled_cp_access(struct kvm_vcpu *vcpu, | |||
| 1920 | WARN_ON(1); | 1922 | WARN_ON(1); |
| 1921 | } | 1923 | } |
| 1922 | 1924 | ||
| 1923 | kvm_err("Unsupported guest CP%d access at: %08lx\n", | 1925 | kvm_err("Unsupported guest CP%d access at: %08lx [%08lx]\n", |
| 1924 | cp, *vcpu_pc(vcpu)); | 1926 | cp, *vcpu_pc(vcpu), *vcpu_cpsr(vcpu)); |
| 1925 | print_sys_reg_instr(params); | 1927 | print_sys_reg_instr(params); |
| 1926 | kvm_inject_undefined(vcpu); | 1928 | kvm_inject_undefined(vcpu); |
| 1927 | } | 1929 | } |
| @@ -2071,8 +2073,8 @@ static int emulate_sys_reg(struct kvm_vcpu *vcpu, | |||
| 2071 | if (likely(r)) { | 2073 | if (likely(r)) { |
| 2072 | perform_access(vcpu, params, r); | 2074 | perform_access(vcpu, params, r); |
| 2073 | } else { | 2075 | } else { |
| 2074 | kvm_err("Unsupported guest sys_reg access at: %lx\n", | 2076 | kvm_err("Unsupported guest sys_reg access at: %lx [%08lx]\n", |
| 2075 | *vcpu_pc(vcpu)); | 2077 | *vcpu_pc(vcpu), *vcpu_cpsr(vcpu)); |
| 2076 | print_sys_reg_instr(params); | 2078 | print_sys_reg_instr(params); |
| 2077 | kvm_inject_undefined(vcpu); | 2079 | kvm_inject_undefined(vcpu); |
| 2078 | } | 2080 | } |
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
index cd710f8b63e0..3b1bc7f01d0b 100644
--- a/arch/arm64/kvm/sys_regs.h
+++ b/arch/arm64/kvm/sys_regs.h
| @@ -35,6 +35,9 @@ struct sys_reg_params { | |||
| 35 | }; | 35 | }; |
| 36 | 36 | ||
| 37 | struct sys_reg_desc { | 37 | struct sys_reg_desc { |
| 38 | /* Sysreg string for debug */ | ||
| 39 | const char *name; | ||
| 40 | |||
| 38 | /* MRS/MSR instruction which accesses it. */ | 41 | /* MRS/MSR instruction which accesses it. */ |
| 39 | u8 Op0; | 42 | u8 Op0; |
| 40 | u8 Op1; | 43 | u8 Op1; |
| @@ -130,6 +133,7 @@ const struct sys_reg_desc *find_reg_by_id(u64 id, | |||
| 130 | #define Op2(_x) .Op2 = _x | 133 | #define Op2(_x) .Op2 = _x |
| 131 | 134 | ||
| 132 | #define SYS_DESC(reg) \ | 135 | #define SYS_DESC(reg) \ |
| 136 | .name = #reg, \ | ||
| 133 | Op0(sys_reg_Op0(reg)), Op1(sys_reg_Op1(reg)), \ | 137 | Op0(sys_reg_Op0(reg)), Op1(sys_reg_Op1(reg)), \ |
| 134 | CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ | 138 | CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ |
| 135 | Op2(sys_reg_Op2(reg)) | 139 | Op2(sys_reg_Op2(reg)) |
diff --git a/arch/arm64/kvm/trace.h b/arch/arm64/kvm/trace.h
index 3b82fb1ddd09..eab91ad0effb 100644
--- a/arch/arm64/kvm/trace.h
+++ b/arch/arm64/kvm/trace.h
| @@ -3,6 +3,7 @@ | |||
| 3 | #define _TRACE_ARM64_KVM_H | 3 | #define _TRACE_ARM64_KVM_H |
| 4 | 4 | ||
| 5 | #include <linux/tracepoint.h> | 5 | #include <linux/tracepoint.h> |
| 6 | #include "sys_regs.h" | ||
| 6 | 7 | ||
| 7 | #undef TRACE_SYSTEM | 8 | #undef TRACE_SYSTEM |
| 8 | #define TRACE_SYSTEM kvm | 9 | #define TRACE_SYSTEM kvm |
| @@ -152,6 +153,40 @@ TRACE_EVENT(kvm_handle_sys_reg, | |||
| 152 | TP_printk("HSR 0x%08lx", __entry->hsr) | 153 | TP_printk("HSR 0x%08lx", __entry->hsr) |
| 153 | ); | 154 | ); |
| 154 | 155 | ||
| 156 | TRACE_EVENT(kvm_sys_access, | ||
| 157 | TP_PROTO(unsigned long vcpu_pc, struct sys_reg_params *params, const struct sys_reg_desc *reg), | ||
| 158 | TP_ARGS(vcpu_pc, params, reg), | ||
| 159 | |||
| 160 | TP_STRUCT__entry( | ||
| 161 | __field(unsigned long, vcpu_pc) | ||
| 162 | __field(bool, is_write) | ||
| 163 | __field(const char *, name) | ||
| 164 | __field(u8, Op0) | ||
| 165 | __field(u8, Op1) | ||
| 166 | __field(u8, CRn) | ||
| 167 | __field(u8, CRm) | ||
| 168 | __field(u8, Op2) | ||
| 169 | ), | ||
| 170 | |||
| 171 | TP_fast_assign( | ||
| 172 | __entry->vcpu_pc = vcpu_pc; | ||
| 173 | __entry->is_write = params->is_write; | ||
| 174 | __entry->name = reg->name; | ||
| 175 | __entry->Op0 = reg->Op0; | ||
| 177 | __entry->Op1 = reg->Op1; | ||
| 178 | __entry->CRn = reg->CRn; | ||
| 179 | __entry->CRm = reg->CRm; | ||
| 180 | __entry->Op2 = reg->Op2; | ||
| 181 | ), | ||
| 182 | |||
| 183 | TP_printk("PC: %lx %s (%d,%d,%d,%d,%d) %s", | ||
| 184 | __entry->vcpu_pc, __entry->name ?: "UNKN", | ||
| 185 | __entry->Op0, __entry->Op1, __entry->CRn, | ||
| 186 | __entry->CRm, __entry->Op2, | ||
| 187 | __entry->is_write ? "write" : "read") | ||
| 188 | ); | ||
| 189 | |||
| 155 | TRACE_EVENT(kvm_set_guest_debug, | 190 | TRACE_EVENT(kvm_set_guest_debug, |
| 156 | TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug), | 191 | TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug), |
| 157 | TP_ARGS(vcpu, guest_debug), | 192 | TP_ARGS(vcpu, guest_debug), |
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index e445026858bc..d2abd98471e8 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
| @@ -936,7 +936,7 @@ enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu, | |||
| 936 | #define KVM_ARCH_WANT_MMU_NOTIFIER | 936 | #define KVM_ARCH_WANT_MMU_NOTIFIER |
| 937 | int kvm_unmap_hva_range(struct kvm *kvm, | 937 | int kvm_unmap_hva_range(struct kvm *kvm, |
| 938 | unsigned long start, unsigned long end); | 938 | unsigned long start, unsigned long end); |
| 939 | void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); | 939 | int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); |
| 940 | int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); | 940 | int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); |
| 941 | int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); | 941 | int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); |
| 942 | 942 | ||
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 1fcc4d149054..3734cd58895e 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
| @@ -1004,14 +1004,37 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) | |||
| 1004 | { | 1004 | { |
| 1005 | struct kvm_memslots *slots; | 1005 | struct kvm_memslots *slots; |
| 1006 | struct kvm_memory_slot *memslot; | 1006 | struct kvm_memory_slot *memslot; |
| 1007 | bool is_dirty = false; | 1007 | bool flush = false; |
| 1008 | int r; | 1008 | int r; |
| 1009 | 1009 | ||
| 1010 | mutex_lock(&kvm->slots_lock); | 1010 | mutex_lock(&kvm->slots_lock); |
| 1011 | 1011 | ||
| 1012 | r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); | 1012 | r = kvm_get_dirty_log_protect(kvm, log, &flush); |
| 1013 | 1013 | ||
| 1014 | if (is_dirty) { | 1014 | if (flush) { |
| 1015 | slots = kvm_memslots(kvm); | ||
| 1016 | memslot = id_to_memslot(slots, log->slot); | ||
| 1017 | |||
| 1018 | /* Let implementation handle TLB/GVA invalidation */ | ||
| 1019 | kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot); | ||
| 1020 | } | ||
| 1021 | |||
| 1022 | mutex_unlock(&kvm->slots_lock); | ||
| 1023 | return r; | ||
| 1024 | } | ||
| 1025 | |||
| 1026 | int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log) | ||
| 1027 | { | ||
| 1028 | struct kvm_memslots *slots; | ||
| 1029 | struct kvm_memory_slot *memslot; | ||
| 1030 | bool flush = false; | ||
| 1031 | int r; | ||
| 1032 | |||
| 1033 | mutex_lock(&kvm->slots_lock); | ||
| 1034 | |||
| 1035 | r = kvm_clear_dirty_log_protect(kvm, log, &flush); | ||
| 1036 | |||
| 1037 | if (flush) { | ||
| 1015 | slots = kvm_memslots(kvm); | 1038 | slots = kvm_memslots(kvm); |
| 1016 | memslot = id_to_memslot(slots, log->slot); | 1039 | memslot = id_to_memslot(slots, log->slot); |
| 1017 | 1040 | ||
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index d8dcdb350405..97e538a8c1be 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
| @@ -551,7 +551,7 @@ static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, | |||
| 551 | (pte_dirty(old_pte) && !pte_dirty(hva_pte)); | 551 | (pte_dirty(old_pte) && !pte_dirty(hva_pte)); |
| 552 | } | 552 | } |
| 553 | 553 | ||
| 554 | void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) | 554 | int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) |
| 555 | { | 555 | { |
| 556 | unsigned long end = hva + PAGE_SIZE; | 556 | unsigned long end = hva + PAGE_SIZE; |
| 557 | int ret; | 557 | int ret; |
| @@ -559,6 +559,7 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) | |||
| 559 | ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte); | 559 | ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte); |
| 560 | if (ret) | 560 | if (ret) |
| 561 | kvm_mips_callbacks->flush_shadow_all(kvm); | 561 | kvm_mips_callbacks->flush_shadow_all(kvm); |
| 562 | return 0; | ||
| 562 | } | 563 | } |
| 563 | 564 | ||
| 564 | static int kvm_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, | 565 | static int kvm_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, |
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index 33a4fc891947..463c63a9fcf1 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
| @@ -335,6 +335,7 @@ | |||
| 335 | #define H_SET_PARTITION_TABLE 0xF800 | 335 | #define H_SET_PARTITION_TABLE 0xF800 |
| 336 | #define H_ENTER_NESTED 0xF804 | 336 | #define H_ENTER_NESTED 0xF804 |
| 337 | #define H_TLB_INVALIDATE 0xF808 | 337 | #define H_TLB_INVALIDATE 0xF808 |
| 338 | #define H_COPY_TOFROM_GUEST 0xF80C | ||
| 338 | 339 | ||
| 339 | /* Values for 2nd argument to H_SET_MODE */ | 340 | /* Values for 2nd argument to H_SET_MODE */ |
| 340 | #define H_SET_MODE_RESOURCE_SET_CIABR 1 | 341 | #define H_SET_MODE_RESOURCE_SET_CIABR 1 |
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 09f8e9ba69bc..38f1b879f569 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
| @@ -188,6 +188,13 @@ extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc); | |||
| 188 | extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run, | 188 | extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run, |
| 189 | struct kvm_vcpu *vcpu, | 189 | struct kvm_vcpu *vcpu, |
| 190 | unsigned long ea, unsigned long dsisr); | 190 | unsigned long ea, unsigned long dsisr); |
| 191 | extern unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid, | ||
| 192 | gva_t eaddr, void *to, void *from, | ||
| 193 | unsigned long n); | ||
| 194 | extern long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, | ||
| 195 | void *to, unsigned long n); | ||
| 196 | extern long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, | ||
| 197 | void *from, unsigned long n); | ||
| 191 | extern int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr, | 198 | extern int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr, |
| 192 | struct kvmppc_pte *gpte, u64 root, | 199 | struct kvmppc_pte *gpte, u64 root, |
| 193 | u64 *pte_ret_p); | 200 | u64 *pte_ret_p); |
| @@ -196,8 +203,11 @@ extern int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr, | |||
| 196 | int table_index, u64 *pte_ret_p); | 203 | int table_index, u64 *pte_ret_p); |
| 197 | extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, | 204 | extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, |
| 198 | struct kvmppc_pte *gpte, bool data, bool iswrite); | 205 | struct kvmppc_pte *gpte, bool data, bool iswrite); |
| 206 | extern void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr, | ||
| 207 | unsigned int pshift, unsigned int lpid); | ||
| 199 | extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa, | 208 | extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa, |
| 200 | unsigned int shift, struct kvm_memory_slot *memslot, | 209 | unsigned int shift, |
| 210 | const struct kvm_memory_slot *memslot, | ||
| 201 | unsigned int lpid); | 211 | unsigned int lpid); |
| 202 | extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, | 212 | extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, |
| 203 | bool writing, unsigned long gpa, | 213 | bool writing, unsigned long gpa, |
| @@ -215,16 +225,14 @@ extern int kvmppc_radix_init(void); | |||
| 215 | extern void kvmppc_radix_exit(void); | 225 | extern void kvmppc_radix_exit(void); |
| 216 | extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, | 226 | extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, |
| 217 | unsigned long gfn); | 227 | unsigned long gfn); |
| 218 | extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, | ||
| 219 | unsigned long gpa, unsigned int shift, | ||
| 220 | struct kvm_memory_slot *memslot, | ||
| 221 | unsigned int lpid); | ||
| 222 | extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, | 228 | extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, |
| 223 | unsigned long gfn); | 229 | unsigned long gfn); |
| 224 | extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, | 230 | extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, |
| 225 | unsigned long gfn); | 231 | unsigned long gfn); |
| 226 | extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, | 232 | extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, |
| 227 | struct kvm_memory_slot *memslot, unsigned long *map); | 233 | struct kvm_memory_slot *memslot, unsigned long *map); |
| 234 | extern void kvmppc_radix_flush_memslot(struct kvm *kvm, | ||
| 235 | const struct kvm_memory_slot *memslot); | ||
| 228 | extern int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info); | 236 | extern int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info); |
| 229 | 237 | ||
| 230 | /* XXX remove this export when load_last_inst() is generic */ | 238 | /* XXX remove this export when load_last_inst() is generic */ |
| @@ -242,7 +250,7 @@ extern kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
| 242 | bool writing, bool *writable); | 250 | bool writing, bool *writable); |
| 243 | extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, | 251 | extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, |
| 244 | unsigned long *rmap, long pte_index, int realmode); | 252 | unsigned long *rmap, long pte_index, int realmode); |
| 245 | extern void kvmppc_update_dirty_map(struct kvm_memory_slot *memslot, | 253 | extern void kvmppc_update_dirty_map(const struct kvm_memory_slot *memslot, |
| 246 | unsigned long gfn, unsigned long psize); | 254 | unsigned long gfn, unsigned long psize); |
| 247 | extern void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep, | 255 | extern void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep, |
| 248 | unsigned long pte_index); | 256 | unsigned long pte_index); |
| @@ -298,6 +306,7 @@ long kvmhv_nested_init(void); | |||
| 298 | void kvmhv_nested_exit(void); | 306 | void kvmhv_nested_exit(void); |
| 299 | void kvmhv_vm_nested_init(struct kvm *kvm); | 307 | void kvmhv_vm_nested_init(struct kvm *kvm); |
| 300 | long kvmhv_set_partition_table(struct kvm_vcpu *vcpu); | 308 | long kvmhv_set_partition_table(struct kvm_vcpu *vcpu); |
| 309 | long kvmhv_copy_tofrom_guest_nested(struct kvm_vcpu *vcpu); | ||
| 301 | void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1); | 310 | void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1); |
| 302 | void kvmhv_release_all_nested(struct kvm *kvm); | 311 | void kvmhv_release_all_nested(struct kvm *kvm); |
| 303 | long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu); | 312 | long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu); |
| @@ -307,7 +316,7 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu, | |||
| 307 | void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr); | 316 | void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr); |
| 308 | void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu, | 317 | void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu, |
| 309 | struct hv_guest_state *hr); | 318 | struct hv_guest_state *hr); |
| 310 | long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu); | 319 | long int kvmhv_nested_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu); |
| 311 | 320 | ||
| 312 | void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac); | 321 | void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac); |
| 313 | 322 | ||
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 6d298145d564..21b1ed5df888 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h | |||
| @@ -55,6 +55,7 @@ struct kvm_nested_guest { | |||
| 55 | cpumask_t need_tlb_flush; | 55 | cpumask_t need_tlb_flush; |
| 56 | cpumask_t cpu_in_guest; | 56 | cpumask_t cpu_in_guest; |
| 57 | short prev_cpu[NR_CPUS]; | 57 | short prev_cpu[NR_CPUS]; |
| 58 | u8 radix; /* is this nested guest radix */ | ||
| 58 | }; | 59 | }; |
| 59 | 60 | ||
| 60 | /* | 61 | /* |
| @@ -150,6 +151,18 @@ static inline bool kvm_is_radix(struct kvm *kvm) | |||
| 150 | return kvm->arch.radix; | 151 | return kvm->arch.radix; |
| 151 | } | 152 | } |
| 152 | 153 | ||
| 154 | static inline bool kvmhv_vcpu_is_radix(struct kvm_vcpu *vcpu) | ||
| 155 | { | ||
| 156 | bool radix; | ||
| 157 | |||
| 158 | if (vcpu->arch.nested) | ||
| 159 | radix = vcpu->arch.nested->radix; | ||
| 160 | else | ||
| 161 | radix = kvm_is_radix(vcpu->kvm); | ||
| 162 | |||
| 163 | return radix; | ||
| 164 | } | ||
| 165 | |||
| 153 | #define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ | 166 | #define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ |
| 154 | #endif | 167 | #endif |
| 155 | 168 | ||
| @@ -624,8 +637,11 @@ extern int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, | |||
| 624 | unsigned long *rmapp, struct rmap_nested **n_rmap); | 637 | unsigned long *rmapp, struct rmap_nested **n_rmap); |
| 625 | extern void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp, | 638 | extern void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp, |
| 626 | struct rmap_nested **n_rmap); | 639 | struct rmap_nested **n_rmap); |
| 640 | extern void kvmhv_update_nest_rmap_rc_list(struct kvm *kvm, unsigned long *rmapp, | ||
| 641 | unsigned long clr, unsigned long set, | ||
| 642 | unsigned long hpa, unsigned long nbytes); | ||
| 627 | extern void kvmhv_remove_nest_rmap_range(struct kvm *kvm, | 643 | extern void kvmhv_remove_nest_rmap_range(struct kvm *kvm, |
| 628 | struct kvm_memory_slot *memslot, | 644 | const struct kvm_memory_slot *memslot, |
| 629 | unsigned long gpa, unsigned long hpa, | 645 | unsigned long gpa, unsigned long hpa, |
| 630 | unsigned long nbytes); | 646 | unsigned long nbytes); |
| 631 | 647 | ||
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index fac6f631ed29..0f98f00da2ea 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h | |||
| @@ -72,7 +72,7 @@ extern int kvm_unmap_hva_range(struct kvm *kvm, | |||
| 72 | unsigned long start, unsigned long end); | 72 | unsigned long start, unsigned long end); |
| 73 | extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); | 73 | extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); |
| 74 | extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); | 74 | extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); |
| 75 | extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); | 75 | extern int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); |
| 76 | 76 | ||
| 77 | #define HPTEG_CACHE_NUM (1 << 15) | 77 | #define HPTEG_CACHE_NUM (1 << 15) |
| 78 | #define HPTEG_HASH_BITS_PTE 13 | 78 | #define HPTEG_HASH_BITS_PTE 13 |
| @@ -793,6 +793,7 @@ struct kvm_vcpu_arch { | |||
| 793 | /* For support of nested guests */ | 793 | /* For support of nested guests */ |
| 794 | struct kvm_nested_guest *nested; | 794 | struct kvm_nested_guest *nested; |
| 795 | u32 nested_vcpu_id; | 795 | u32 nested_vcpu_id; |
| 796 | gpa_t nested_io_gpr; | ||
| 796 | #endif | 797 | #endif |
| 797 | 798 | ||
| 798 | #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING | 799 | #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING |
| @@ -827,6 +828,8 @@ struct kvm_vcpu_arch { | |||
| 827 | #define KVM_MMIO_REG_FQPR 0x00c0 | 828 | #define KVM_MMIO_REG_FQPR 0x00c0 |
| 828 | #define KVM_MMIO_REG_VSX 0x0100 | 829 | #define KVM_MMIO_REG_VSX 0x0100 |
| 829 | #define KVM_MMIO_REG_VMX 0x0180 | 830 | #define KVM_MMIO_REG_VMX 0x0180 |
| 831 | #define KVM_MMIO_REG_NESTED_GPR 0xffc0 | ||
| 832 | |||
| 830 | 833 | ||
| 831 | #define __KVM_HAVE_ARCH_WQP | 834 | #define __KVM_HAVE_ARCH_WQP |
| 832 | #define __KVM_HAVE_CREATE_DEVICE | 835 | #define __KVM_HAVE_CREATE_DEVICE |
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 9b89b1918dfc..eb0d79f0ca45 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h | |||
| @@ -224,7 +224,8 @@ extern int kvmppc_core_prepare_memory_region(struct kvm *kvm, | |||
| 224 | extern void kvmppc_core_commit_memory_region(struct kvm *kvm, | 224 | extern void kvmppc_core_commit_memory_region(struct kvm *kvm, |
| 225 | const struct kvm_userspace_memory_region *mem, | 225 | const struct kvm_userspace_memory_region *mem, |
| 226 | const struct kvm_memory_slot *old, | 226 | const struct kvm_memory_slot *old, |
| 227 | const struct kvm_memory_slot *new); | 227 | const struct kvm_memory_slot *new, |
| 228 | enum kvm_mr_change change); | ||
| 228 | extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, | 229 | extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, |
| 229 | struct kvm_ppc_smmu_info *info); | 230 | struct kvm_ppc_smmu_info *info); |
| 230 | extern void kvmppc_core_flush_memslot(struct kvm *kvm, | 231 | extern void kvmppc_core_flush_memslot(struct kvm *kvm, |
| @@ -294,7 +295,8 @@ struct kvmppc_ops { | |||
| 294 | void (*commit_memory_region)(struct kvm *kvm, | 295 | void (*commit_memory_region)(struct kvm *kvm, |
| 295 | const struct kvm_userspace_memory_region *mem, | 296 | const struct kvm_userspace_memory_region *mem, |
| 296 | const struct kvm_memory_slot *old, | 297 | const struct kvm_memory_slot *old, |
| 297 | const struct kvm_memory_slot *new); | 298 | const struct kvm_memory_slot *new, |
| 299 | enum kvm_mr_change change); | ||
| 298 | int (*unmap_hva_range)(struct kvm *kvm, unsigned long start, | 300 | int (*unmap_hva_range)(struct kvm *kvm, unsigned long start, |
| 299 | unsigned long end); | 301 | unsigned long end); |
| 300 | int (*age_hva)(struct kvm *kvm, unsigned long start, unsigned long end); | 302 | int (*age_hva)(struct kvm *kvm, unsigned long start, unsigned long end); |
| @@ -326,6 +328,10 @@ struct kvmppc_ops { | |||
| 326 | unsigned long flags); | 328 | unsigned long flags); |
| 327 | void (*giveup_ext)(struct kvm_vcpu *vcpu, ulong msr); | 329 | void (*giveup_ext)(struct kvm_vcpu *vcpu, ulong msr); |
| 328 | int (*enable_nested)(struct kvm *kvm); | 330 | int (*enable_nested)(struct kvm *kvm); |
| 331 | int (*load_from_eaddr)(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr, | ||
| 332 | int size); | ||
| 333 | int (*store_to_eaddr)(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr, | ||
| 334 | int size); | ||
| 329 | }; | 335 | }; |
| 330 | 336 | ||
| 331 | extern struct kvmppc_ops *kvmppc_hv_ops; | 337 | extern struct kvmppc_ops *kvmppc_hv_ops; |
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 89d32bb79d5e..db2691ff4c0b 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S | |||
| @@ -995,7 +995,16 @@ EXC_COMMON_BEGIN(h_data_storage_common) | |||
| 995 | bl save_nvgprs | 995 | bl save_nvgprs |
| 996 | RECONCILE_IRQ_STATE(r10, r11) | 996 | RECONCILE_IRQ_STATE(r10, r11) |
| 997 | addi r3,r1,STACK_FRAME_OVERHEAD | 997 | addi r3,r1,STACK_FRAME_OVERHEAD |
| 998 | BEGIN_MMU_FTR_SECTION | ||
| 999 | ld r4,PACA_EXGEN+EX_DAR(r13) | ||
| 1000 | lwz r5,PACA_EXGEN+EX_DSISR(r13) | ||
| 1001 | std r4,_DAR(r1) | ||
| 1002 | std r5,_DSISR(r1) | ||
| 1003 | li r5,SIGSEGV | ||
| 1004 | bl bad_page_fault | ||
| 1005 | MMU_FTR_SECTION_ELSE | ||
| 998 | bl unknown_exception | 1006 | bl unknown_exception |
| 1007 | ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_TYPE_RADIX) | ||
| 999 | b ret_from_except | 1008 | b ret_from_except |
| 1000 | 1009 | ||
| 1001 | 1010 | ||
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index fd9893bc7aa1..bd1a677dd9e4 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c | |||
| @@ -830,9 +830,10 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, | |||
| 830 | void kvmppc_core_commit_memory_region(struct kvm *kvm, | 830 | void kvmppc_core_commit_memory_region(struct kvm *kvm, |
| 831 | const struct kvm_userspace_memory_region *mem, | 831 | const struct kvm_userspace_memory_region *mem, |
| 832 | const struct kvm_memory_slot *old, | 832 | const struct kvm_memory_slot *old, |
| 833 | const struct kvm_memory_slot *new) | 833 | const struct kvm_memory_slot *new, |
| 834 | enum kvm_mr_change change) | ||
| 834 | { | 835 | { |
| 835 | kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new); | 836 | kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new, change); |
| 836 | } | 837 | } |
| 837 | 838 | ||
| 838 | int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) | 839 | int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) |
| @@ -850,9 +851,10 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) | |||
| 850 | return kvm->arch.kvm_ops->test_age_hva(kvm, hva); | 851 | return kvm->arch.kvm_ops->test_age_hva(kvm, hva); |
| 851 | } | 852 | } |
| 852 | 853 | ||
| 853 | void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) | 854 | int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) |
| 854 | { | 855 | { |
| 855 | kvm->arch.kvm_ops->set_spte_hva(kvm, hva, pte); | 856 | kvm->arch.kvm_ops->set_spte_hva(kvm, hva, pte); |
| 857 | return 0; | ||
| 856 | } | 858 | } |
| 857 | 859 | ||
| 858 | void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) | 860 | void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) |
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index c615617e78ac..6f2d2fb4e098 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c | |||
| @@ -743,12 +743,15 @@ void kvmppc_rmap_reset(struct kvm *kvm) | |||
| 743 | srcu_idx = srcu_read_lock(&kvm->srcu); | 743 | srcu_idx = srcu_read_lock(&kvm->srcu); |
| 744 | slots = kvm_memslots(kvm); | 744 | slots = kvm_memslots(kvm); |
| 745 | kvm_for_each_memslot(memslot, slots) { | 745 | kvm_for_each_memslot(memslot, slots) { |
| 746 | /* Mutual exclusion with kvm_unmap_hva_range etc. */ | ||
| 747 | spin_lock(&kvm->mmu_lock); | ||
| 746 | /* | 748 | /* |
| 747 | * This assumes it is acceptable to lose reference and | 749 | * This assumes it is acceptable to lose reference and |
| 748 | * change bits across a reset. | 750 | * change bits across a reset. |
| 749 | */ | 751 | */ |
| 750 | memset(memslot->arch.rmap, 0, | 752 | memset(memslot->arch.rmap, 0, |
| 751 | memslot->npages * sizeof(*memslot->arch.rmap)); | 753 | memslot->npages * sizeof(*memslot->arch.rmap)); |
| 754 | spin_unlock(&kvm->mmu_lock); | ||
| 752 | } | 755 | } |
| 753 | srcu_read_unlock(&kvm->srcu, srcu_idx); | 756 | srcu_read_unlock(&kvm->srcu, srcu_idx); |
| 754 | } | 757 | } |
| @@ -896,11 +899,12 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm, | |||
| 896 | 899 | ||
| 897 | gfn = memslot->base_gfn; | 900 | gfn = memslot->base_gfn; |
| 898 | rmapp = memslot->arch.rmap; | 901 | rmapp = memslot->arch.rmap; |
| 902 | if (kvm_is_radix(kvm)) { | ||
| 903 | kvmppc_radix_flush_memslot(kvm, memslot); | ||
| 904 | return; | ||
| 905 | } | ||
| 906 | |||
| 899 | for (n = memslot->npages; n; --n, ++gfn) { | 907 | for (n = memslot->npages; n; --n, ++gfn) { |
| 900 | if (kvm_is_radix(kvm)) { | ||
| 901 | kvm_unmap_radix(kvm, memslot, gfn); | ||
| 902 | continue; | ||
| 903 | } | ||
| 904 | /* | 908 | /* |
| 905 | * Testing the present bit without locking is OK because | 909 | * Testing the present bit without locking is OK because |
| 906 | * the memslot has been marked invalid already, and hence | 910 | * the memslot has been marked invalid already, and hence |
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index d68162ee159b..fb88167a402a 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c | |||
| @@ -29,6 +29,103 @@ | |||
| 29 | */ | 29 | */ |
| 30 | static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 }; | 30 | static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 }; |
| 31 | 31 | ||
| 32 | unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid, | ||
| 33 | gva_t eaddr, void *to, void *from, | ||
| 34 | unsigned long n) | ||
| 35 | { | ||
| 36 | unsigned long quadrant, ret = n; | ||
| 37 | int old_pid, old_lpid; | ||
| 38 | bool is_load = !!to; | ||
| 39 | |||
| 40 | /* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */ | ||
| 41 | if (kvmhv_on_pseries()) | ||
| 42 | return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr, | ||
| 43 | __pa(to), __pa(from), n); | ||
| 44 | |||
| 45 | quadrant = 1; | ||
| 46 | if (!pid) | ||
| 47 | quadrant = 2; | ||
| 48 | if (is_load) | ||
| 49 | from = (void *) (eaddr | (quadrant << 62)); | ||
| 50 | else | ||
| 51 | to = (void *) (eaddr | (quadrant << 62)); | ||
| 52 | |||
| 53 | preempt_disable(); | ||
| 54 | |||
| 55 | /* switch the lpid first to avoid running host with unallocated pid */ | ||
| 56 | old_lpid = mfspr(SPRN_LPID); | ||
| 57 | if (old_lpid != lpid) | ||
| 58 | mtspr(SPRN_LPID, lpid); | ||
| 59 | if (quadrant == 1) { | ||
| 60 | old_pid = mfspr(SPRN_PID); | ||
| 61 | if (old_pid != pid) | ||
| 62 | mtspr(SPRN_PID, pid); | ||
| 63 | } | ||
| 64 | isync(); | ||
| 65 | |||
| 66 | pagefault_disable(); | ||
| 67 | if (is_load) | ||
| 68 | ret = raw_copy_from_user(to, from, n); | ||
| 69 | else | ||
| 70 | ret = raw_copy_to_user(to, from, n); | ||
| 71 | pagefault_enable(); | ||
| 72 | |||
| 73 | /* switch the pid first to avoid running host with unallocated pid */ | ||
| 74 | if (quadrant == 1 && pid != old_pid) | ||
| 75 | mtspr(SPRN_PID, old_pid); | ||
| 76 | if (lpid != old_lpid) | ||
| 77 | mtspr(SPRN_LPID, old_lpid); | ||
| 78 | isync(); | ||
| 79 | |||
| 80 | preempt_enable(); | ||
| 81 | |||
| 82 | return ret; | ||
| 83 | } | ||
| 84 | EXPORT_SYMBOL_GPL(__kvmhv_copy_tofrom_guest_radix); | ||
| 85 | |||
| 86 | static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, | ||
| 87 | void *to, void *from, unsigned long n) | ||
| 88 | { | ||
| 89 | int lpid = vcpu->kvm->arch.lpid; | ||
| 90 | int pid = vcpu->arch.pid; | ||
| 91 | |||
| 92 | /* This would cause a data segment intr so don't allow the access */ | ||
| 93 | if (eaddr & (0x3FFUL << 52)) | ||
| 94 | return -EINVAL; | ||
| 95 | |||
| 96 | /* Should we be using the nested lpid */ | ||
| 97 | if (vcpu->arch.nested) | ||
| 98 | lpid = vcpu->arch.nested->shadow_lpid; | ||
| 99 | |||
| 100 | /* If accessing quadrant 3 then pid is expected to be 0 */ | ||
| 101 | if (((eaddr >> 62) & 0x3) == 0x3) | ||
| 102 | pid = 0; | ||
| 103 | |||
| 104 | eaddr &= ~(0xFFFUL << 52); | ||
| 105 | |||
| 106 | return __kvmhv_copy_tofrom_guest_radix(lpid, pid, eaddr, to, from, n); | ||
| 107 | } | ||
| 108 | |||
| 109 | long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to, | ||
| 110 | unsigned long n) | ||
| 111 | { | ||
| 112 | long ret; | ||
| 113 | |||
| 114 | ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n); | ||
| 115 | if (ret > 0) | ||
| 116 | memset(to + (n - ret), 0, ret); | ||
| 117 | |||
| 118 | return ret; | ||
| 119 | } | ||
| 120 | EXPORT_SYMBOL_GPL(kvmhv_copy_from_guest_radix); | ||
| 121 | |||
| 122 | long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from, | ||
| 123 | unsigned long n) | ||
| 124 | { | ||
| 125 | return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n); | ||
| 126 | } | ||
| 127 | EXPORT_SYMBOL_GPL(kvmhv_copy_to_guest_radix); | ||
| 128 | |||
| 32 | int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr, | 129 | int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr, |
| 33 | struct kvmppc_pte *gpte, u64 root, | 130 | struct kvmppc_pte *gpte, u64 root, |
| 34 | u64 *pte_ret_p) | 131 | u64 *pte_ret_p) |
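The block added above reaches guest memory through the radix quadrant access mechanism: the hypervisor temporarily installs the guest's LPID (and, for quadrant 1, its PID), then performs an ordinary copy against an effective address whose two top bits select the quadrant. A minimal user-space sketch of just the address construction, with illustrative names and no kernel dependencies, could look like this:

    /* quadrant_ea_sketch.c - illustration only; bit layout follows the
     * patch above (quadrant number in EA bits 63:62). */
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t quadrant_ea(uint64_t eaddr, int pid)
    {
            /* pid != 0: quadrant 1 (guest process space)
             * pid == 0: quadrant 2 (guest kernel space) */
            uint64_t quadrant = pid ? 1ULL : 2ULL;

            return eaddr | (quadrant << 62);
    }

    int main(void)
    {
            printf("pid 7: 0x%016llx\n",
                   (unsigned long long)quadrant_ea(0x1000, 7));
            printf("pid 0: 0x%016llx\n",
                   (unsigned long long)quadrant_ea(0x1000, 0));
            return 0;
    }

The preempt_disable()/pagefault_disable() pairing and the LPID-before-PID switch order in the kernel code are what keep the host from ever running with an unallocated PID while the foreign context is installed.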
| @@ -197,8 +294,8 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, | |||
| 197 | return 0; | 294 | return 0; |
| 198 | } | 295 | } |
| 199 | 296 | ||
| 200 | static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr, | 297 | void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr, |
| 201 | unsigned int pshift, unsigned int lpid) | 298 | unsigned int pshift, unsigned int lpid) |
| 202 | { | 299 | { |
| 203 | unsigned long psize = PAGE_SIZE; | 300 | unsigned long psize = PAGE_SIZE; |
| 204 | int psi; | 301 | int psi; |
| @@ -284,7 +381,8 @@ static void kvmppc_pmd_free(pmd_t *pmdp) | |||
| 284 | 381 | ||
| 285 | /* Called with kvm->mmu_lock held */ | 382 | /* Called with kvm->mmu_lock held */ |
| 286 | void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa, | 383 | void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa, |
| 287 | unsigned int shift, struct kvm_memory_slot *memslot, | 384 | unsigned int shift, |
| 385 | const struct kvm_memory_slot *memslot, | ||
| 288 | unsigned int lpid) | 386 | unsigned int lpid) |
| 289 | 387 | ||
| 290 | { | 388 | { |
| @@ -683,6 +781,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, | |||
| 683 | pte_t pte, *ptep; | 781 | pte_t pte, *ptep; |
| 684 | unsigned int shift, level; | 782 | unsigned int shift, level; |
| 685 | int ret; | 783 | int ret; |
| 784 | bool large_enable; | ||
| 686 | 785 | ||
| 687 | /* used to check for invalidations in progress */ | 786 | /* used to check for invalidations in progress */ |
| 688 | mmu_seq = kvm->mmu_notifier_seq; | 787 | mmu_seq = kvm->mmu_notifier_seq; |
| @@ -732,12 +831,15 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, | |||
| 732 | pte = *ptep; | 831 | pte = *ptep; |
| 733 | local_irq_enable(); | 832 | local_irq_enable(); |
| 734 | 833 | ||
| 834 | /* If we're logging dirty pages, always map single pages */ | ||
| 835 | large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES); | ||
| 836 | |||
| 735 | /* Get pte level from shift/size */ | 837 | /* Get pte level from shift/size */ |
| 736 | if (shift == PUD_SHIFT && | 838 | if (large_enable && shift == PUD_SHIFT && |
| 737 | (gpa & (PUD_SIZE - PAGE_SIZE)) == | 839 | (gpa & (PUD_SIZE - PAGE_SIZE)) == |
| 738 | (hva & (PUD_SIZE - PAGE_SIZE))) { | 840 | (hva & (PUD_SIZE - PAGE_SIZE))) { |
| 739 | level = 2; | 841 | level = 2; |
| 740 | } else if (shift == PMD_SHIFT && | 842 | } else if (large_enable && shift == PMD_SHIFT && |
| 741 | (gpa & (PMD_SIZE - PAGE_SIZE)) == | 843 | (gpa & (PMD_SIZE - PAGE_SIZE)) == |
| 742 | (hva & (PMD_SIZE - PAGE_SIZE))) { | 844 | (hva & (PMD_SIZE - PAGE_SIZE))) { |
| 743 | level = 1; | 845 | level = 1; |
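The large_enable change above gives up 2MB/1GB mappings whenever KVM_MEM_LOG_DIRTY_PAGES is set, so that dirtiness can be tracked at 4K granularity; a large mapping is only chosen when logging is off and the guest physical and host virtual addresses are congruent within the large page. A compilable fragment with illustrative page-size constants (not the kernel's PUD_SIZE/PMD_SIZE definitions):

    #include <stdbool.h>
    #include <stdint.h>

    #define SK_PAGE_SHIFT 12
    #define SK_PMD_SHIFT  21        /* 2MB, illustrative */
    #define SK_PUD_SHIFT  30        /* 1GB, illustrative */

    /* 0 = 4K, 1 = 2MB, 2 = 1GB, mirroring the level values above */
    int pick_level(uint64_t gpa, uint64_t hva, unsigned int host_shift,
                   bool dirty_logging)
    {
            bool large_enable = !dirty_logging;
            uint64_t pud_mask = (1ULL << SK_PUD_SHIFT) - (1ULL << SK_PAGE_SHIFT);
            uint64_t pmd_mask = (1ULL << SK_PMD_SHIFT) - (1ULL << SK_PAGE_SHIFT);

            if (large_enable && host_shift == SK_PUD_SHIFT &&
                (gpa & pud_mask) == (hva & pud_mask))
                    return 2;
            if (large_enable && host_shift == SK_PMD_SHIFT &&
                (gpa & pmd_mask) == (hva & pmd_mask))
                    return 1;
            return 0;
    }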
| @@ -857,7 +959,7 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, | |||
| 857 | return ret; | 959 | return ret; |
| 858 | } | 960 | } |
| 859 | 961 | ||
| 860 | /* Called with kvm->lock held */ | 962 | /* Called with kvm->mmu_lock held */ |
| 861 | int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, | 963 | int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, |
| 862 | unsigned long gfn) | 964 | unsigned long gfn) |
| 863 | { | 965 | { |
| @@ -872,7 +974,7 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, | |||
| 872 | return 0; | 974 | return 0; |
| 873 | } | 975 | } |
| 874 | 976 | ||
| 875 | /* Called with kvm->lock held */ | 977 | /* Called with kvm->mmu_lock held */ |
| 876 | int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, | 978 | int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, |
| 877 | unsigned long gfn) | 979 | unsigned long gfn) |
| 878 | { | 980 | { |
| @@ -880,18 +982,24 @@ int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, | |||
| 880 | unsigned long gpa = gfn << PAGE_SHIFT; | 982 | unsigned long gpa = gfn << PAGE_SHIFT; |
| 881 | unsigned int shift; | 983 | unsigned int shift; |
| 882 | int ref = 0; | 984 | int ref = 0; |
| 985 | unsigned long old, *rmapp; | ||
| 883 | 986 | ||
| 884 | ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); | 987 | ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); |
| 885 | if (ptep && pte_present(*ptep) && pte_young(*ptep)) { | 988 | if (ptep && pte_present(*ptep) && pte_young(*ptep)) { |
| 886 | kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0, | 989 | old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0, |
| 887 | gpa, shift); | 990 | gpa, shift); |
| 888 | /* XXX need to flush tlb here? */ | 991 | /* XXX need to flush tlb here? */ |
| 992 | /* Also clear bit in ptes in shadow pgtable for nested guests */ | ||
| 993 | rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; | ||
| 994 | kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0, | ||
| 995 | old & PTE_RPN_MASK, | ||
| 996 | 1UL << shift); | ||
| 889 | ref = 1; | 997 | ref = 1; |
| 890 | } | 998 | } |
| 891 | return ref; | 999 | return ref; |
| 892 | } | 1000 | } |
| 893 | 1001 | ||
| 894 | /* Called with kvm->lock held */ | 1002 | /* Called with kvm->mmu_lock held */ |
| 895 | int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, | 1003 | int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, |
| 896 | unsigned long gfn) | 1004 | unsigned long gfn) |
| 897 | { | 1005 | { |
| @@ -915,15 +1023,23 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm, | |||
| 915 | pte_t *ptep; | 1023 | pte_t *ptep; |
| 916 | unsigned int shift; | 1024 | unsigned int shift; |
| 917 | int ret = 0; | 1025 | int ret = 0; |
| 1026 | unsigned long old, *rmapp; | ||
| 918 | 1027 | ||
| 919 | ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); | 1028 | ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); |
| 920 | if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) { | 1029 | if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) { |
| 921 | ret = 1; | 1030 | ret = 1; |
| 922 | if (shift) | 1031 | if (shift) |
| 923 | ret = 1 << (shift - PAGE_SHIFT); | 1032 | ret = 1 << (shift - PAGE_SHIFT); |
| 924 | kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0, | 1033 | spin_lock(&kvm->mmu_lock); |
| 925 | gpa, shift); | 1034 | old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0, |
| 1035 | gpa, shift); | ||
| 926 | kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid); | 1036 | kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid); |
| 1037 | /* Also clear bit in ptes in shadow pgtable for nested guests */ | ||
| 1038 | rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; | ||
| 1039 | kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_DIRTY, 0, | ||
| 1040 | old & PTE_RPN_MASK, | ||
| 1041 | 1UL << shift); | ||
| 1042 | spin_unlock(&kvm->mmu_lock); | ||
| 927 | } | 1043 | } |
| 928 | return ret; | 1044 | return ret; |
| 929 | } | 1045 | } |
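With kvm->mmu_lock now held across the dirty-bit clear, the function still reports how many 4K dirty-bitmap bits a single PTE accounts for: 1 << (shift - PAGE_SHIFT) for a large PTE, one otherwise. A tiny stand-alone illustration of that arithmetic (a PAGE_SHIFT of 12 is assumed purely for the example):

    #include <stdio.h>

    #define SK_PAGE_SHIFT 12

    static unsigned long pages_covered(unsigned int shift)
    {
            return shift ? 1UL << (shift - SK_PAGE_SHIFT) : 1;
    }

    int main(void)
    {
            printf("4K PTE:  %lu page(s)\n", pages_covered(0));
            printf("2MB PTE: %lu page(s)\n", pages_covered(21));
            printf("1GB PTE: %lu page(s)\n", pages_covered(30));
            return 0;
    }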
| @@ -953,6 +1069,26 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, | |||
| 953 | return 0; | 1069 | return 0; |
| 954 | } | 1070 | } |
| 955 | 1071 | ||
| 1072 | void kvmppc_radix_flush_memslot(struct kvm *kvm, | ||
| 1073 | const struct kvm_memory_slot *memslot) | ||
| 1074 | { | ||
| 1075 | unsigned long n; | ||
| 1076 | pte_t *ptep; | ||
| 1077 | unsigned long gpa; | ||
| 1078 | unsigned int shift; | ||
| 1079 | |||
| 1080 | gpa = memslot->base_gfn << PAGE_SHIFT; | ||
| 1081 | spin_lock(&kvm->mmu_lock); | ||
| 1082 | for (n = memslot->npages; n; --n) { | ||
| 1083 | ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); | ||
| 1084 | if (ptep && pte_present(*ptep)) | ||
| 1085 | kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot, | ||
| 1086 | kvm->arch.lpid); | ||
| 1087 | gpa += PAGE_SIZE; | ||
| 1088 | } | ||
| 1089 | spin_unlock(&kvm->mmu_lock); | ||
| 1090 | } | ||
| 1091 | |||
| 956 | static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info, | 1092 | static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info, |
| 957 | int psize, int *indexp) | 1093 | int psize, int *indexp) |
| 958 | { | 1094 | { |
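kvmppc_radix_flush_memslot() above is a straight walk over the slot's guest-physical range, unmapping whatever is currently present while holding kvm->mmu_lock. The shape of the loop, with the page-table lookup stubbed out and illustrative types, is roughly:

    #include <stdint.h>

    #define SK_PAGE_SHIFT 12
    #define SK_PAGE_SIZE  (1UL << SK_PAGE_SHIFT)

    struct sk_memslot {
            uint64_t base_gfn;
            uint64_t npages;
    };

    /* stand-in for __find_linux_pte() + kvmppc_unmap_pte() */
    static void lookup_and_unmap(uint64_t gpa) { (void)gpa; }

    void flush_memslot_sketch(const struct sk_memslot *slot)
    {
            uint64_t gpa = slot->base_gfn << SK_PAGE_SHIFT;
            uint64_t n;

            /* the kernel holds kvm->mmu_lock across this loop */
            for (n = slot->npages; n; --n, gpa += SK_PAGE_SIZE)
                    lookup_and_unmap(gpa);
    }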
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index a56f8413758a..5a066fc299e1 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c | |||
| @@ -985,6 +985,10 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) | |||
| 985 | kvmppc_set_gpr(vcpu, 3, 0); | 985 | kvmppc_set_gpr(vcpu, 3, 0); |
| 986 | vcpu->arch.hcall_needed = 0; | 986 | vcpu->arch.hcall_needed = 0; |
| 987 | return -EINTR; | 987 | return -EINTR; |
| 988 | } else if (ret == H_TOO_HARD) { | ||
| 989 | kvmppc_set_gpr(vcpu, 3, 0); | ||
| 990 | vcpu->arch.hcall_needed = 0; | ||
| 991 | return RESUME_HOST; | ||
| 988 | } | 992 | } |
| 989 | break; | 993 | break; |
| 990 | case H_TLB_INVALIDATE: | 994 | case H_TLB_INVALIDATE: |
| @@ -992,7 +996,11 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) | |||
| 992 | if (nesting_enabled(vcpu->kvm)) | 996 | if (nesting_enabled(vcpu->kvm)) |
| 993 | ret = kvmhv_do_nested_tlbie(vcpu); | 997 | ret = kvmhv_do_nested_tlbie(vcpu); |
| 994 | break; | 998 | break; |
| 995 | 999 | case H_COPY_TOFROM_GUEST: | |
| 1000 | ret = H_FUNCTION; | ||
| 1001 | if (nesting_enabled(vcpu->kvm)) | ||
| 1002 | ret = kvmhv_copy_tofrom_guest_nested(vcpu); | ||
| 1003 | break; | ||
| 996 | default: | 1004 | default: |
| 997 | return RESUME_HOST; | 1005 | return RESUME_HOST; |
| 998 | } | 1006 | } |
| @@ -1336,7 +1344,7 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, | |||
| 1336 | return r; | 1344 | return r; |
| 1337 | } | 1345 | } |
| 1338 | 1346 | ||
| 1339 | static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) | 1347 | static int kvmppc_handle_nested_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) |
| 1340 | { | 1348 | { |
| 1341 | int r; | 1349 | int r; |
| 1342 | int srcu_idx; | 1350 | int srcu_idx; |
| @@ -1394,7 +1402,7 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) | |||
| 1394 | */ | 1402 | */ |
| 1395 | case BOOK3S_INTERRUPT_H_DATA_STORAGE: | 1403 | case BOOK3S_INTERRUPT_H_DATA_STORAGE: |
| 1396 | srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); | 1404 | srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); |
| 1397 | r = kvmhv_nested_page_fault(vcpu); | 1405 | r = kvmhv_nested_page_fault(run, vcpu); |
| 1398 | srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); | 1406 | srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); |
| 1399 | break; | 1407 | break; |
| 1400 | case BOOK3S_INTERRUPT_H_INST_STORAGE: | 1408 | case BOOK3S_INTERRUPT_H_INST_STORAGE: |
| @@ -1404,7 +1412,7 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) | |||
| 1404 | if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE) | 1412 | if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE) |
| 1405 | vcpu->arch.fault_dsisr |= DSISR_ISSTORE; | 1413 | vcpu->arch.fault_dsisr |= DSISR_ISSTORE; |
| 1406 | srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); | 1414 | srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); |
| 1407 | r = kvmhv_nested_page_fault(vcpu); | 1415 | r = kvmhv_nested_page_fault(run, vcpu); |
| 1408 | srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); | 1416 | srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); |
| 1409 | break; | 1417 | break; |
| 1410 | 1418 | ||
| @@ -4059,7 +4067,7 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, | |||
| 4059 | if (!nested) | 4067 | if (!nested) |
| 4060 | r = kvmppc_handle_exit_hv(kvm_run, vcpu, current); | 4068 | r = kvmppc_handle_exit_hv(kvm_run, vcpu, current); |
| 4061 | else | 4069 | else |
| 4062 | r = kvmppc_handle_nested_exit(vcpu); | 4070 | r = kvmppc_handle_nested_exit(kvm_run, vcpu); |
| 4063 | } | 4071 | } |
| 4064 | vcpu->arch.ret = r; | 4072 | vcpu->arch.ret = r; |
| 4065 | 4073 | ||
| @@ -4371,7 +4379,8 @@ static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm, | |||
| 4371 | static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, | 4379 | static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, |
| 4372 | const struct kvm_userspace_memory_region *mem, | 4380 | const struct kvm_userspace_memory_region *mem, |
| 4373 | const struct kvm_memory_slot *old, | 4381 | const struct kvm_memory_slot *old, |
| 4374 | const struct kvm_memory_slot *new) | 4382 | const struct kvm_memory_slot *new, |
| 4383 | enum kvm_mr_change change) | ||
| 4375 | { | 4384 | { |
| 4376 | unsigned long npages = mem->memory_size >> PAGE_SHIFT; | 4385 | unsigned long npages = mem->memory_size >> PAGE_SHIFT; |
| 4377 | 4386 | ||
| @@ -4383,6 +4392,23 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, | |||
| 4383 | */ | 4392 | */ |
| 4384 | if (npages) | 4393 | if (npages) |
| 4385 | atomic64_inc(&kvm->arch.mmio_update); | 4394 | atomic64_inc(&kvm->arch.mmio_update); |
| 4395 | |||
| 4396 | /* | ||
| 4397 | * For change == KVM_MR_MOVE or KVM_MR_DELETE, higher levels | ||
| 4398 | * have already called kvm_arch_flush_shadow_memslot() to | ||
| 4399 | * flush shadow mappings. For KVM_MR_CREATE we have no | ||
| 4400 | * previous mappings. So the only case to handle is | ||
| 4401 | * KVM_MR_FLAGS_ONLY when the KVM_MEM_LOG_DIRTY_PAGES bit | ||
| 4402 | * has been changed. | ||
| 4403 | * For radix guests, we flush on setting KVM_MEM_LOG_DIRTY_PAGES | ||
| 4404 | * to get rid of any THP PTEs in the partition-scoped page tables | ||
| 4405 | * so we can track dirtiness at the page level; we flush when | ||
| 4406 | * clearing KVM_MEM_LOG_DIRTY_PAGES so that we can go back to | ||
| 4407 | * using THP PTEs. | ||
| 4408 | */ | ||
| 4409 | if (change == KVM_MR_FLAGS_ONLY && kvm_is_radix(kvm) && | ||
| 4410 | ((new->flags ^ old->flags) & KVM_MEM_LOG_DIRTY_PAGES)) | ||
| 4411 | kvmppc_radix_flush_memslot(kvm, old); | ||
| 4386 | } | 4412 | } |
| 4387 | 4413 | ||
| 4388 | /* | 4414 | /* |
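The comment block added above narrows the commit-time work to one case: a KVM_MR_FLAGS_ONLY update that toggles KVM_MEM_LOG_DIRTY_PAGES on a radix guest, which forces a memslot flush so the mapping size can change (4K while logging, THP-sized otherwise). The toggle test reduces to an XOR of the old and new flag words, as in this fragment (flag bit illustrative):

    #include <stdbool.h>

    #define SK_LOG_DIRTY_PAGES (1u << 0)    /* illustrative bit value */

    bool dirty_log_flush_needed(unsigned int old_flags, unsigned int new_flags,
                                bool flags_only_change, bool is_radix)
    {
            return flags_only_change && is_radix &&
                   ((new_flags ^ old_flags) & SK_LOG_DIRTY_PAGES);
    }

Enabling and disabling the flag both trip the XOR, which matches the comment: flush on enable to drop THP PTEs, flush on disable so THP PTEs can be used again.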
| @@ -4532,12 +4558,15 @@ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm) | |||
| 4532 | { | 4558 | { |
| 4533 | if (nesting_enabled(kvm)) | 4559 | if (nesting_enabled(kvm)) |
| 4534 | kvmhv_release_all_nested(kvm); | 4560 | kvmhv_release_all_nested(kvm); |
| 4561 | kvmppc_rmap_reset(kvm); | ||
| 4562 | kvm->arch.process_table = 0; | ||
| 4563 | /* Mutual exclusion with kvm_unmap_hva_range etc. */ | ||
| 4564 | spin_lock(&kvm->mmu_lock); | ||
| 4565 | kvm->arch.radix = 0; | ||
| 4566 | spin_unlock(&kvm->mmu_lock); | ||
| 4535 | kvmppc_free_radix(kvm); | 4567 | kvmppc_free_radix(kvm); |
| 4536 | kvmppc_update_lpcr(kvm, LPCR_VPM1, | 4568 | kvmppc_update_lpcr(kvm, LPCR_VPM1, |
| 4537 | LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR); | 4569 | LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR); |
| 4538 | kvmppc_rmap_reset(kvm); | ||
| 4539 | kvm->arch.radix = 0; | ||
| 4540 | kvm->arch.process_table = 0; | ||
| 4541 | return 0; | 4570 | return 0; |
| 4542 | } | 4571 | } |
| 4543 | 4572 | ||
| @@ -4549,12 +4578,14 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm) | |||
| 4549 | err = kvmppc_init_vm_radix(kvm); | 4578 | err = kvmppc_init_vm_radix(kvm); |
| 4550 | if (err) | 4579 | if (err) |
| 4551 | return err; | 4580 | return err; |
| 4552 | 4581 | kvmppc_rmap_reset(kvm); | |
| 4582 | /* Mutual exclusion with kvm_unmap_hva_range etc. */ | ||
| 4583 | spin_lock(&kvm->mmu_lock); | ||
| 4584 | kvm->arch.radix = 1; | ||
| 4585 | spin_unlock(&kvm->mmu_lock); | ||
| 4553 | kvmppc_free_hpt(&kvm->arch.hpt); | 4586 | kvmppc_free_hpt(&kvm->arch.hpt); |
| 4554 | kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR, | 4587 | kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR, |
| 4555 | LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR); | 4588 | LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR); |
| 4556 | kvmppc_rmap_reset(kvm); | ||
| 4557 | kvm->arch.radix = 1; | ||
| 4558 | return 0; | 4589 | return 0; |
| 4559 | } | 4590 | } |
| 4560 | 4591 | ||
| @@ -5214,6 +5245,44 @@ static int kvmhv_enable_nested(struct kvm *kvm) | |||
| 5214 | return 0; | 5245 | return 0; |
| 5215 | } | 5246 | } |
| 5216 | 5247 | ||
| 5248 | static int kvmhv_load_from_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr, | ||
| 5249 | int size) | ||
| 5250 | { | ||
| 5251 | int rc = -EINVAL; | ||
| 5252 | |||
| 5253 | if (kvmhv_vcpu_is_radix(vcpu)) { | ||
| 5254 | rc = kvmhv_copy_from_guest_radix(vcpu, *eaddr, ptr, size); | ||
| 5255 | |||
| 5256 | if (rc > 0) | ||
| 5257 | rc = -EINVAL; | ||
| 5258 | } | ||
| 5259 | |||
| 5260 | /* For now quadrants are the only way to access nested guest memory */ | ||
| 5261 | if (rc && vcpu->arch.nested) | ||
| 5262 | rc = -EAGAIN; | ||
| 5263 | |||
| 5264 | return rc; | ||
| 5265 | } | ||
| 5266 | |||
| 5267 | static int kvmhv_store_to_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr, | ||
| 5268 | int size) | ||
| 5269 | { | ||
| 5270 | int rc = -EINVAL; | ||
| 5271 | |||
| 5272 | if (kvmhv_vcpu_is_radix(vcpu)) { | ||
| 5273 | rc = kvmhv_copy_to_guest_radix(vcpu, *eaddr, ptr, size); | ||
| 5274 | |||
| 5275 | if (rc > 0) | ||
| 5276 | rc = -EINVAL; | ||
| 5277 | } | ||
| 5278 | |||
| 5279 | /* For now quadrants are the only way to access nested guest memory */ | ||
| 5280 | if (rc && vcpu->arch.nested) | ||
| 5281 | rc = -EAGAIN; | ||
| 5282 | |||
| 5283 | return rc; | ||
| 5284 | } | ||
| 5285 | |||
| 5217 | static struct kvmppc_ops kvm_ops_hv = { | 5286 | static struct kvmppc_ops kvm_ops_hv = { |
| 5218 | .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv, | 5287 | .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv, |
| 5219 | .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv, | 5288 | .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv, |
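The two helpers added above (and wired into kvmppc_ops in the following hunk) follow a small return-code convention: a partial copy count coming back from the radix copy helpers is folded into -EINVAL, and any failure on a nested vcpu becomes -EAGAIN because the quadrant access is currently the only way to reach nested-guest memory. A compilable fragment of that mapping (errno values as in the patch, helper name invented):

    #include <errno.h>
    #include <stdbool.h>

    int eaddr_access_rc(long copy_result, bool vcpu_is_radix, bool vcpu_is_nested)
    {
            int rc = -EINVAL;

            if (vcpu_is_radix) {
                    rc = (int)copy_result;
                    if (rc > 0)             /* bytes left uncopied */
                            rc = -EINVAL;
            }

            if (rc && vcpu_is_nested)
                    rc = -EAGAIN;           /* caller must not fall back */

            return rc;
    }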
| @@ -5254,6 +5323,8 @@ static struct kvmppc_ops kvm_ops_hv = { | |||
| 5254 | .get_rmmu_info = kvmhv_get_rmmu_info, | 5323 | .get_rmmu_info = kvmhv_get_rmmu_info, |
| 5255 | .set_smt_mode = kvmhv_set_smt_mode, | 5324 | .set_smt_mode = kvmhv_set_smt_mode, |
| 5256 | .enable_nested = kvmhv_enable_nested, | 5325 | .enable_nested = kvmhv_enable_nested, |
| 5326 | .load_from_eaddr = kvmhv_load_from_eaddr, | ||
| 5327 | .store_to_eaddr = kvmhv_store_to_eaddr, | ||
| 5257 | }; | 5328 | }; |
| 5258 | 5329 | ||
| 5259 | static int kvm_init_subcore_bitmap(void) | 5330 | static int kvm_init_subcore_bitmap(void) |
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 401d2ecbebc5..735e0ac6f5b2 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c | |||
| @@ -195,6 +195,26 @@ void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu, | |||
| 195 | vcpu->arch.ppr = hr->ppr; | 195 | vcpu->arch.ppr = hr->ppr; |
| 196 | } | 196 | } |
| 197 | 197 | ||
| 198 | static void kvmhv_nested_mmio_needed(struct kvm_vcpu *vcpu, u64 regs_ptr) | ||
| 199 | { | ||
| 200 | /* No need to reflect the page fault to L1, we've handled it */ | ||
| 201 | vcpu->arch.trap = 0; | ||
| 202 | |||
| 203 | /* | ||
| 204 | * Since the L2 gprs have already been written back into L1 memory when | ||
| 205 | * we complete the mmio, store the L1 memory location of the L2 gpr | ||
| 206 | * being loaded into by the mmio so that the loaded value can be | ||
| 207 | * written there in kvmppc_complete_mmio_load() | ||
| 208 | */ | ||
| 209 | if (((vcpu->arch.io_gpr & KVM_MMIO_REG_EXT_MASK) == KVM_MMIO_REG_GPR) | ||
| 210 | && (vcpu->mmio_is_write == 0)) { | ||
| 211 | vcpu->arch.nested_io_gpr = (gpa_t) regs_ptr + | ||
| 212 | offsetof(struct pt_regs, | ||
| 213 | gpr[vcpu->arch.io_gpr]); | ||
| 214 | vcpu->arch.io_gpr = KVM_MMIO_REG_NESTED_GPR; | ||
| 215 | } | ||
| 216 | } | ||
| 217 | |||
| 198 | long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) | 218 | long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) |
| 199 | { | 219 | { |
| 200 | long int err, r; | 220 | long int err, r; |
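kvmhv_nested_mmio_needed() above defers a nested MMIO load by remembering where, in L1 memory, the target L2 GPR lives: the L1 address of the register save area plus the offset of that GPR within struct pt_regs. The address arithmetic, using a cut-down stand-in for pt_regs rather than the kernel's definition, is simply:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct sk_pt_regs {
            uint64_t gpr[32];
            uint64_t nip;
            uint64_t msr;
    };

    static uint64_t nested_io_gpr_addr(uint64_t regs_ptr, unsigned int gpr_index)
    {
            return regs_ptr + offsetof(struct sk_pt_regs, gpr) +
                   gpr_index * sizeof(uint64_t);
    }

    int main(void)
    {
            /* L2 register area at L1 gpa 0x4000, load targets r5 */
            printf("0x%llx\n", (unsigned long long)nested_io_gpr_addr(0x4000, 5));
            return 0;
    }

The MMIO completion path can then write the loaded value straight to that L1 address rather than into a vcpu register, which is why KVM_MMIO_REG_NESTED_GPR is introduced as a distinct io_gpr encoding.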
| @@ -316,6 +336,11 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) | |||
| 316 | if (r == -EINTR) | 336 | if (r == -EINTR) |
| 317 | return H_INTERRUPT; | 337 | return H_INTERRUPT; |
| 318 | 338 | ||
| 339 | if (vcpu->mmio_needed) { | ||
| 340 | kvmhv_nested_mmio_needed(vcpu, regs_ptr); | ||
| 341 | return H_TOO_HARD; | ||
| 342 | } | ||
| 343 | |||
| 319 | return vcpu->arch.trap; | 344 | return vcpu->arch.trap; |
| 320 | } | 345 | } |
| 321 | 346 | ||
| @@ -437,6 +462,81 @@ long kvmhv_set_partition_table(struct kvm_vcpu *vcpu) | |||
| 437 | } | 462 | } |
| 438 | 463 | ||
| 439 | /* | 464 | /* |
| 465 | * Handle the H_COPY_TOFROM_GUEST hcall. | ||
| 466 | * r4 = L1 lpid of nested guest | ||
| 467 | * r5 = pid | ||
| 468 | * r6 = eaddr to access | ||
| 469 | * r7 = to buffer (L1 gpa) | ||
| 470 | * r8 = from buffer (L1 gpa) | ||
| 471 | * r9 = n bytes to copy | ||
| 472 | */ | ||
| 473 | long kvmhv_copy_tofrom_guest_nested(struct kvm_vcpu *vcpu) | ||
| 474 | { | ||
| 475 | struct kvm_nested_guest *gp; | ||
| 476 | int l1_lpid = kvmppc_get_gpr(vcpu, 4); | ||
| 477 | int pid = kvmppc_get_gpr(vcpu, 5); | ||
| 478 | gva_t eaddr = kvmppc_get_gpr(vcpu, 6); | ||
| 479 | gpa_t gp_to = (gpa_t) kvmppc_get_gpr(vcpu, 7); | ||
| 480 | gpa_t gp_from = (gpa_t) kvmppc_get_gpr(vcpu, 8); | ||
| 481 | void *buf; | ||
| 482 | unsigned long n = kvmppc_get_gpr(vcpu, 9); | ||
| 483 | bool is_load = !!gp_to; | ||
| 484 | long rc; | ||
| 485 | |||
| 486 | if (gp_to && gp_from) /* One must be NULL to determine the direction */ | ||
| 487 | return H_PARAMETER; | ||
| 488 | |||
| 489 | if (eaddr & (0xFFFUL << 52)) | ||
| 490 | return H_PARAMETER; | ||
| 491 | |||
| 492 | buf = kzalloc(n, GFP_KERNEL); | ||
| 493 | if (!buf) | ||
| 494 | return H_NO_MEM; | ||
| 495 | |||
| 496 | gp = kvmhv_get_nested(vcpu->kvm, l1_lpid, false); | ||
| 497 | if (!gp) { | ||
| 498 | rc = H_PARAMETER; | ||
| 499 | goto out_free; | ||
| 500 | } | ||
| 501 | |||
| 502 | mutex_lock(&gp->tlb_lock); | ||
| 503 | |||
| 504 | if (is_load) { | ||
| 505 | /* Load from the nested guest into our buffer */ | ||
| 506 | rc = __kvmhv_copy_tofrom_guest_radix(gp->shadow_lpid, pid, | ||
| 507 | eaddr, buf, NULL, n); | ||
| 508 | if (rc) | ||
| 509 | goto not_found; | ||
| 510 | |||
| 511 | /* Write what was loaded into our buffer back to the L1 guest */ | ||
| 512 | rc = kvm_vcpu_write_guest(vcpu, gp_to, buf, n); | ||
| 513 | if (rc) | ||
| 514 | goto not_found; | ||
| 515 | } else { | ||
| 516 | /* Load the data to be stored from the L1 guest into our buf */ | ||
| 517 | rc = kvm_vcpu_read_guest(vcpu, gp_from, buf, n); | ||
| 518 | if (rc) | ||
| 519 | goto not_found; | ||
| 520 | |||
| 521 | /* Store from our buffer into the nested guest */ | ||
| 522 | rc = __kvmhv_copy_tofrom_guest_radix(gp->shadow_lpid, pid, | ||
| 523 | eaddr, NULL, buf, n); | ||
| 524 | if (rc) | ||
| 525 | goto not_found; | ||
| 526 | } | ||
| 527 | |||
| 528 | out_unlock: | ||
| 529 | mutex_unlock(&gp->tlb_lock); | ||
| 530 | kvmhv_put_nested(gp); | ||
| 531 | out_free: | ||
| 532 | kfree(buf); | ||
| 533 | return rc; | ||
| 534 | not_found: | ||
| 535 | rc = H_NOT_FOUND; | ||
| 536 | goto out_unlock; | ||
| 537 | } | ||
| 538 | |||
| 539 | /* | ||
| 440 | * Reload the partition table entry for a guest. | 540 | * Reload the partition table entry for a guest. |
| 441 | * Caller must hold gp->tlb_lock. | 541 | * Caller must hold gp->tlb_lock. |
| 442 | */ | 542 | */ |
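The new H_COPY_TOFROM_GUEST handler above encodes the copy direction in its parameters: exactly one of the two L1 buffer addresses must be zero, and an effective address with any of the reserved top bits set is rejected before anything is staged through the kernel buffer. A fragment of just the parameter screening, with stand-in return codes instead of the real H_* values:

    #include <stdbool.h>
    #include <stdint.h>

    enum { SK_H_SUCCESS, SK_H_PARAMETER };

    int copy_tofrom_guest_check(uint64_t gp_to, uint64_t gp_from,
                                uint64_t eaddr, bool *is_load)
    {
            if (gp_to && gp_from)   /* one must be 0 to pick a direction */
                    return SK_H_PARAMETER;
            if (eaddr & (0xFFFULL << 52))
                    return SK_H_PARAMETER;

            *is_load = (gp_to != 0);        /* load: L2 guest -> L1 buffer */
            return SK_H_SUCCESS;
    }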
| @@ -480,6 +580,7 @@ struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid) | |||
| 480 | if (shadow_lpid < 0) | 580 | if (shadow_lpid < 0) |
| 481 | goto out_free2; | 581 | goto out_free2; |
| 482 | gp->shadow_lpid = shadow_lpid; | 582 | gp->shadow_lpid = shadow_lpid; |
| 583 | gp->radix = 1; | ||
| 483 | 584 | ||
| 484 | memset(gp->prev_cpu, -1, sizeof(gp->prev_cpu)); | 585 | memset(gp->prev_cpu, -1, sizeof(gp->prev_cpu)); |
| 485 | 586 | ||
| @@ -687,6 +788,57 @@ void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp, | |||
| 687 | *n_rmap = NULL; | 788 | *n_rmap = NULL; |
| 688 | } | 789 | } |
| 689 | 790 | ||
| 791 | static void kvmhv_update_nest_rmap_rc(struct kvm *kvm, u64 n_rmap, | ||
| 792 | unsigned long clr, unsigned long set, | ||
| 793 | unsigned long hpa, unsigned long mask) | ||
| 794 | { | ||
| 795 | struct kvm_nested_guest *gp; | ||
| 796 | unsigned long gpa; | ||
| 797 | unsigned int shift, lpid; | ||
| 798 | pte_t *ptep; | ||
| 799 | |||
| 800 | gpa = n_rmap & RMAP_NESTED_GPA_MASK; | ||
| 801 | lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT; | ||
| 802 | gp = kvmhv_find_nested(kvm, lpid); | ||
| 803 | if (!gp) | ||
| 804 | return; | ||
| 805 | |||
| 806 | /* Find the pte */ | ||
| 807 | ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift); | ||
| 808 | /* | ||
| 809 | * If the pte is present and the pfn is still the same, update the pte. | ||
| 810 | * If the pfn has changed then this is a stale rmap entry, the nested | ||
| 811 | * gpa actually points somewhere else now, and there is nothing to do. | ||
| 812 | * XXX A future optimisation would be to remove the rmap entry here. | ||
| 813 | */ | ||
| 814 | if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa)) { | ||
| 815 | __radix_pte_update(ptep, clr, set); | ||
| 816 | kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid); | ||
| 817 | } | ||
| 818 | } | ||
| 819 | |||
| 820 | /* | ||
| 821 | * For a given list of rmap entries, update the rc bits in all ptes in shadow | ||
| 822 | * page tables for nested guests which are referenced by the rmap list. | ||
| 823 | */ | ||
| 824 | void kvmhv_update_nest_rmap_rc_list(struct kvm *kvm, unsigned long *rmapp, | ||
| 825 | unsigned long clr, unsigned long set, | ||
| 826 | unsigned long hpa, unsigned long nbytes) | ||
| 827 | { | ||
| 828 | struct llist_node *entry = ((struct llist_head *) rmapp)->first; | ||
| 829 | struct rmap_nested *cursor; | ||
| 830 | unsigned long rmap, mask; | ||
| 831 | |||
| 832 | if ((clr | set) & ~(_PAGE_DIRTY | _PAGE_ACCESSED)) | ||
| 833 | return; | ||
| 834 | |||
| 835 | mask = PTE_RPN_MASK & ~(nbytes - 1); | ||
| 836 | hpa &= mask; | ||
| 837 | |||
| 838 | for_each_nest_rmap_safe(cursor, entry, &rmap) | ||
| 839 | kvmhv_update_nest_rmap_rc(kvm, rmap, clr, set, hpa, mask); | ||
| 840 | } | ||
| 841 | |||
| 690 | static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap, | 842 | static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap, |
| 691 | unsigned long hpa, unsigned long mask) | 843 | unsigned long hpa, unsigned long mask) |
| 692 | { | 844 | { |
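kvmhv_update_nest_rmap_rc() above guards against stale rmap entries by comparing the shadow PTE's real-page-number field, masked to the size of the host page, against the host real address being updated; only a matching entry has its rc bits changed and its TLB entry invalidated. The mask-and-compare, with an illustrative RPN mask rather than the kernel's PTE_RPN_MASK, is:

    #include <stdbool.h>
    #include <stdint.h>

    #define SK_PTE_RPN_MASK 0x01fffffffffff000ULL   /* illustrative */

    /* nbytes is the power-of-two size of the host page backing the PTE */
    bool rmap_entry_matches(uint64_t pte_val, uint64_t hpa, uint64_t nbytes)
    {
            uint64_t mask = SK_PTE_RPN_MASK & ~(nbytes - 1);

            return (pte_val & mask) == (hpa & mask);
    }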
| @@ -723,7 +875,7 @@ static void kvmhv_remove_nest_rmap_list(struct kvm *kvm, unsigned long *rmapp, | |||
| 723 | 875 | ||
| 724 | /* called with kvm->mmu_lock held */ | 876 | /* called with kvm->mmu_lock held */ |
| 725 | void kvmhv_remove_nest_rmap_range(struct kvm *kvm, | 877 | void kvmhv_remove_nest_rmap_range(struct kvm *kvm, |
| 726 | struct kvm_memory_slot *memslot, | 878 | const struct kvm_memory_slot *memslot, |
| 727 | unsigned long gpa, unsigned long hpa, | 879 | unsigned long gpa, unsigned long hpa, |
| 728 | unsigned long nbytes) | 880 | unsigned long nbytes) |
| 729 | { | 881 | { |
| @@ -1049,7 +1201,7 @@ static long kvmhv_handle_nested_set_rc(struct kvm_vcpu *vcpu, | |||
| 1049 | struct kvm *kvm = vcpu->kvm; | 1201 | struct kvm *kvm = vcpu->kvm; |
| 1050 | bool writing = !!(dsisr & DSISR_ISSTORE); | 1202 | bool writing = !!(dsisr & DSISR_ISSTORE); |
| 1051 | u64 pgflags; | 1203 | u64 pgflags; |
| 1052 | bool ret; | 1204 | long ret; |
| 1053 | 1205 | ||
| 1054 | /* Are the rc bits set in the L1 partition scoped pte? */ | 1206 | /* Are the rc bits set in the L1 partition scoped pte? */ |
| 1055 | pgflags = _PAGE_ACCESSED; | 1207 | pgflags = _PAGE_ACCESSED; |
| @@ -1062,16 +1214,22 @@ static long kvmhv_handle_nested_set_rc(struct kvm_vcpu *vcpu, | |||
| 1062 | /* Set the rc bit in the pte of our (L0) pgtable for the L1 guest */ | 1214 | /* Set the rc bit in the pte of our (L0) pgtable for the L1 guest */ |
| 1063 | ret = kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, writing, | 1215 | ret = kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, writing, |
| 1064 | gpte.raddr, kvm->arch.lpid); | 1216 | gpte.raddr, kvm->arch.lpid); |
| 1065 | spin_unlock(&kvm->mmu_lock); | 1217 | if (!ret) { |
| 1066 | if (!ret) | 1218 | ret = -EINVAL; |
| 1067 | return -EINVAL; | 1219 | goto out_unlock; |
| 1220 | } | ||
| 1068 | 1221 | ||
| 1069 | /* Set the rc bit in the pte of the shadow_pgtable for the nest guest */ | 1222 | /* Set the rc bit in the pte of the shadow_pgtable for the nest guest */ |
| 1070 | ret = kvmppc_hv_handle_set_rc(kvm, gp->shadow_pgtable, writing, n_gpa, | 1223 | ret = kvmppc_hv_handle_set_rc(kvm, gp->shadow_pgtable, writing, n_gpa, |
| 1071 | gp->shadow_lpid); | 1224 | gp->shadow_lpid); |
| 1072 | if (!ret) | 1225 | if (!ret) |
| 1073 | return -EINVAL; | 1226 | ret = -EINVAL; |
| 1074 | return 0; | 1227 | else |
| 1228 | ret = 0; | ||
| 1229 | |||
| 1230 | out_unlock: | ||
| 1231 | spin_unlock(&kvm->mmu_lock); | ||
| 1232 | return ret; | ||
| 1075 | } | 1233 | } |
| 1076 | 1234 | ||
| 1077 | static inline int kvmppc_radix_level_to_shift(int level) | 1235 | static inline int kvmppc_radix_level_to_shift(int level) |
| @@ -1099,7 +1257,8 @@ static inline int kvmppc_radix_shift_to_level(int shift) | |||
| 1099 | } | 1257 | } |
| 1100 | 1258 | ||
| 1101 | /* called with gp->tlb_lock held */ | 1259 | /* called with gp->tlb_lock held */ |
| 1102 | static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu, | 1260 | static long int __kvmhv_nested_page_fault(struct kvm_run *run, |
| 1261 | struct kvm_vcpu *vcpu, | ||
| 1103 | struct kvm_nested_guest *gp) | 1262 | struct kvm_nested_guest *gp) |
| 1104 | { | 1263 | { |
| 1105 | struct kvm *kvm = vcpu->kvm; | 1264 | struct kvm *kvm = vcpu->kvm; |
| @@ -1180,9 +1339,9 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu, | |||
| 1180 | kvmppc_core_queue_data_storage(vcpu, ea, dsisr); | 1339 | kvmppc_core_queue_data_storage(vcpu, ea, dsisr); |
| 1181 | return RESUME_GUEST; | 1340 | return RESUME_GUEST; |
| 1182 | } | 1341 | } |
| 1183 | /* passthrough of emulated MMIO case... */ | 1342 | |
| 1184 | pr_err("emulated MMIO passthrough?\n"); | 1343 | /* passthrough of emulated MMIO case */ |
| 1185 | return -EINVAL; | 1344 | return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, writing); |
| 1186 | } | 1345 | } |
| 1187 | if (memslot->flags & KVM_MEM_READONLY) { | 1346 | if (memslot->flags & KVM_MEM_READONLY) { |
| 1188 | if (writing) { | 1347 | if (writing) { |
| @@ -1220,6 +1379,8 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu, | |||
| 1220 | return ret; | 1379 | return ret; |
| 1221 | shift = kvmppc_radix_level_to_shift(level); | 1380 | shift = kvmppc_radix_level_to_shift(level); |
| 1222 | } | 1381 | } |
| 1382 | /* Align gfn to the start of the page */ | ||
| 1383 | gfn = (gpa & ~((1UL << shift) - 1)) >> PAGE_SHIFT; | ||
| 1223 | 1384 | ||
| 1224 | /* 3. Compute the pte we need to insert for nest_gpa -> host r_addr */ | 1385 | /* 3. Compute the pte we need to insert for nest_gpa -> host r_addr */ |
| 1225 | 1386 | ||
| @@ -1227,6 +1388,9 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu, | |||
| 1227 | perm |= gpte.may_read ? 0UL : _PAGE_READ; | 1388 | perm |= gpte.may_read ? 0UL : _PAGE_READ; |
| 1228 | perm |= gpte.may_write ? 0UL : _PAGE_WRITE; | 1389 | perm |= gpte.may_write ? 0UL : _PAGE_WRITE; |
| 1229 | perm |= gpte.may_execute ? 0UL : _PAGE_EXEC; | 1390 | perm |= gpte.may_execute ? 0UL : _PAGE_EXEC; |
| 1391 | /* Only set accessed/dirty (rc) bits if set in host and l1 guest ptes */ | ||
| 1392 | perm |= (gpte.rc & _PAGE_ACCESSED) ? 0UL : _PAGE_ACCESSED; | ||
| 1393 | perm |= ((gpte.rc & _PAGE_DIRTY) && writing) ? 0UL : _PAGE_DIRTY; | ||
| 1230 | pte = __pte(pte_val(pte) & ~perm); | 1394 | pte = __pte(pte_val(pte) & ~perm); |
| 1231 | 1395 | ||
| 1232 | /* What size pte can we insert? */ | 1396 | /* What size pte can we insert? */ |
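The two perm lines added above pre-seed the referenced/changed (rc) bits in the shadow PTE only when the L1 guest's PTE already has them (and, for the dirty bit, only on a store); otherwise the bits stay clear so the next access faults and the rc state gets tracked at both levels. A fragment of the mask construction with illustrative bit values instead of the kernel's _PAGE_* constants:

    #include <stdbool.h>
    #include <stdint.h>

    #define SK_READ     0x01
    #define SK_WRITE    0x02
    #define SK_EXEC     0x04
    #define SK_ACCESSED 0x08
    #define SK_DIRTY    0x10

    /* returns the bits to clear from the proposed shadow PTE */
    uint64_t shadow_pte_strip(bool may_read, bool may_write, bool may_exec,
                              uint64_t l1_rc, bool writing)
    {
            uint64_t strip = 0;

            strip |= may_read  ? 0 : SK_READ;
            strip |= may_write ? 0 : SK_WRITE;
            strip |= may_exec  ? 0 : SK_EXEC;
            strip |= (l1_rc & SK_ACCESSED) ? 0 : SK_ACCESSED;
            strip |= ((l1_rc & SK_DIRTY) && writing) ? 0 : SK_DIRTY;

            return strip;
    }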
| @@ -1264,13 +1428,13 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu, | |||
| 1264 | return RESUME_GUEST; | 1428 | return RESUME_GUEST; |
| 1265 | } | 1429 | } |
| 1266 | 1430 | ||
| 1267 | long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu) | 1431 | long int kvmhv_nested_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu) |
| 1268 | { | 1432 | { |
| 1269 | struct kvm_nested_guest *gp = vcpu->arch.nested; | 1433 | struct kvm_nested_guest *gp = vcpu->arch.nested; |
| 1270 | long int ret; | 1434 | long int ret; |
| 1271 | 1435 | ||
| 1272 | mutex_lock(&gp->tlb_lock); | 1436 | mutex_lock(&gp->tlb_lock); |
| 1273 | ret = __kvmhv_nested_page_fault(vcpu, gp); | 1437 | ret = __kvmhv_nested_page_fault(run, vcpu, gp); |
| 1274 | mutex_unlock(&gp->tlb_lock); | 1438 | mutex_unlock(&gp->tlb_lock); |
| 1275 | return ret; | 1439 | return ret; |
| 1276 | } | 1440 | } |
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index a67cf1cdeda4..3b3791ed74a6 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c | |||
| @@ -107,7 +107,7 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, | |||
| 107 | EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); | 107 | EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); |
| 108 | 108 | ||
| 109 | /* Update the dirty bitmap of a memslot */ | 109 | /* Update the dirty bitmap of a memslot */ |
| 110 | void kvmppc_update_dirty_map(struct kvm_memory_slot *memslot, | 110 | void kvmppc_update_dirty_map(const struct kvm_memory_slot *memslot, |
| 111 | unsigned long gfn, unsigned long psize) | 111 | unsigned long gfn, unsigned long psize) |
| 112 | { | 112 | { |
| 113 | unsigned long npages; | 113 | unsigned long npages; |
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 4efd65d9e828..811a3c2fb0e9 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c | |||
| @@ -587,6 +587,7 @@ void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr) | |||
| 587 | case PVR_POWER8: | 587 | case PVR_POWER8: |
| 588 | case PVR_POWER8E: | 588 | case PVR_POWER8E: |
| 589 | case PVR_POWER8NVL: | 589 | case PVR_POWER8NVL: |
| 590 | case PVR_POWER9: | ||
| 590 | vcpu->arch.hflags |= BOOK3S_HFLAG_MULTI_PGSIZE | | 591 | vcpu->arch.hflags |= BOOK3S_HFLAG_MULTI_PGSIZE | |
| 591 | BOOK3S_HFLAG_NEW_TLBIE; | 592 | BOOK3S_HFLAG_NEW_TLBIE; |
| 592 | break; | 593 | break; |
| @@ -1913,7 +1914,8 @@ static int kvmppc_core_prepare_memory_region_pr(struct kvm *kvm, | |||
| 1913 | static void kvmppc_core_commit_memory_region_pr(struct kvm *kvm, | 1914 | static void kvmppc_core_commit_memory_region_pr(struct kvm *kvm, |
| 1914 | const struct kvm_userspace_memory_region *mem, | 1915 | const struct kvm_userspace_memory_region *mem, |
| 1915 | const struct kvm_memory_slot *old, | 1916 | const struct kvm_memory_slot *old, |
| 1916 | const struct kvm_memory_slot *new) | 1917 | const struct kvm_memory_slot *new, |
| 1918 | enum kvm_mr_change change) | ||
| 1917 | { | 1919 | { |
| 1918 | return; | 1920 | return; |
| 1919 | } | 1921 | } |
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index b0b2bfc2ff51..f27ee57ab46e 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c | |||
| @@ -1015,17 +1015,7 @@ static int xics_debug_show(struct seq_file *m, void *private) | |||
| 1015 | return 0; | 1015 | return 0; |
| 1016 | } | 1016 | } |
| 1017 | 1017 | ||
| 1018 | static int xics_debug_open(struct inode *inode, struct file *file) | 1018 | DEFINE_SHOW_ATTRIBUTE(xics_debug); |
| 1019 | { | ||
| 1020 | return single_open(file, xics_debug_show, inode->i_private); | ||
| 1021 | } | ||
| 1022 | |||
| 1023 | static const struct file_operations xics_debug_fops = { | ||
| 1024 | .open = xics_debug_open, | ||
| 1025 | .read = seq_read, | ||
| 1026 | .llseek = seq_lseek, | ||
| 1027 | .release = single_release, | ||
| 1028 | }; | ||
| 1029 | 1019 | ||
| 1030 | static void xics_debugfs_init(struct kvmppc_xics *xics) | 1020 | static void xics_debugfs_init(struct kvmppc_xics *xics) |
| 1031 | { | 1021 | { |
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index ad4a370703d3..f78d002f0fe0 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c | |||
| @@ -1968,17 +1968,7 @@ static int xive_debug_show(struct seq_file *m, void *private) | |||
| 1968 | return 0; | 1968 | return 0; |
| 1969 | } | 1969 | } |
| 1970 | 1970 | ||
| 1971 | static int xive_debug_open(struct inode *inode, struct file *file) | 1971 | DEFINE_SHOW_ATTRIBUTE(xive_debug); |
| 1972 | { | ||
| 1973 | return single_open(file, xive_debug_show, inode->i_private); | ||
| 1974 | } | ||
| 1975 | |||
| 1976 | static const struct file_operations xive_debug_fops = { | ||
| 1977 | .open = xive_debug_open, | ||
| 1978 | .read = seq_read, | ||
| 1979 | .llseek = seq_lseek, | ||
| 1980 | .release = single_release, | ||
| 1981 | }; | ||
| 1982 | 1972 | ||
| 1983 | static void xive_debugfs_init(struct kvmppc_xive *xive) | 1973 | static void xive_debugfs_init(struct kvmppc_xive *xive) |
| 1984 | { | 1974 | { |
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index a9ca016da670..dbec4128bb51 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c | |||
| @@ -1833,7 +1833,8 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, | |||
| 1833 | void kvmppc_core_commit_memory_region(struct kvm *kvm, | 1833 | void kvmppc_core_commit_memory_region(struct kvm *kvm, |
| 1834 | const struct kvm_userspace_memory_region *mem, | 1834 | const struct kvm_userspace_memory_region *mem, |
| 1835 | const struct kvm_memory_slot *old, | 1835 | const struct kvm_memory_slot *old, |
| 1836 | const struct kvm_memory_slot *new) | 1836 | const struct kvm_memory_slot *new, |
| 1837 | enum kvm_mr_change change) | ||
| 1837 | { | 1838 | { |
| 1838 | } | 1839 | } |
| 1839 | 1840 | ||
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 8f2985e46f6f..c3f312b2bcb3 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c | |||
| @@ -757,10 +757,11 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) | |||
| 757 | return 0; | 757 | return 0; |
| 758 | } | 758 | } |
| 759 | 759 | ||
| 760 | void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) | 760 | int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) |
| 761 | { | 761 | { |
| 762 | /* The page will get remapped properly on its next fault */ | 762 | /* The page will get remapped properly on its next fault */ |
| 763 | kvm_unmap_hva(kvm, hva); | 763 | kvm_unmap_hva(kvm, hva); |
| 764 | return 0; | ||
| 764 | } | 765 | } |
| 765 | 766 | ||
| 766 | /*****************************************/ | 767 | /*****************************************/ |
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 2869a299c4ed..b90a7d154180 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c | |||
| @@ -331,10 +331,17 @@ int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, | |||
| 331 | { | 331 | { |
| 332 | ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM & PAGE_MASK; | 332 | ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM & PAGE_MASK; |
| 333 | struct kvmppc_pte pte; | 333 | struct kvmppc_pte pte; |
| 334 | int r; | 334 | int r = -EINVAL; |
| 335 | 335 | ||
| 336 | vcpu->stat.st++; | 336 | vcpu->stat.st++; |
| 337 | 337 | ||
| 338 | if (vcpu->kvm->arch.kvm_ops && vcpu->kvm->arch.kvm_ops->store_to_eaddr) | ||
| 339 | r = vcpu->kvm->arch.kvm_ops->store_to_eaddr(vcpu, eaddr, ptr, | ||
| 340 | size); | ||
| 341 | |||
| 342 | if ((!r) || (r == -EAGAIN)) | ||
| 343 | return r; | ||
| 344 | |||
| 338 | r = kvmppc_xlate(vcpu, *eaddr, data ? XLATE_DATA : XLATE_INST, | 345 | r = kvmppc_xlate(vcpu, *eaddr, data ? XLATE_DATA : XLATE_INST, |
| 339 | XLATE_WRITE, &pte); | 346 | XLATE_WRITE, &pte); |
| 340 | if (r < 0) | 347 | if (r < 0) |
| @@ -367,10 +374,17 @@ int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, | |||
| 367 | { | 374 | { |
| 368 | ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM & PAGE_MASK; | 375 | ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM & PAGE_MASK; |
| 369 | struct kvmppc_pte pte; | 376 | struct kvmppc_pte pte; |
| 370 | int rc; | 377 | int rc = -EINVAL; |
| 371 | 378 | ||
| 372 | vcpu->stat.ld++; | 379 | vcpu->stat.ld++; |
| 373 | 380 | ||
| 381 | if (vcpu->kvm->arch.kvm_ops && vcpu->kvm->arch.kvm_ops->load_from_eaddr) | ||
| 382 | rc = vcpu->kvm->arch.kvm_ops->load_from_eaddr(vcpu, eaddr, ptr, | ||
| 383 | size); | ||
| 384 | |||
| 385 | if ((!rc) || (rc == -EAGAIN)) | ||
| 386 | return rc; | ||
| 387 | |||
| 374 | rc = kvmppc_xlate(vcpu, *eaddr, data ? XLATE_DATA : XLATE_INST, | 388 | rc = kvmppc_xlate(vcpu, *eaddr, data ? XLATE_DATA : XLATE_INST, |
| 375 | XLATE_READ, &pte); | 389 | XLATE_READ, &pte); |
| 376 | if (rc) | 390 | if (rc) |
| @@ -518,7 +532,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) | |||
| 518 | case KVM_CAP_PPC_UNSET_IRQ: | 532 | case KVM_CAP_PPC_UNSET_IRQ: |
| 519 | case KVM_CAP_PPC_IRQ_LEVEL: | 533 | case KVM_CAP_PPC_IRQ_LEVEL: |
| 520 | case KVM_CAP_ENABLE_CAP: | 534 | case KVM_CAP_ENABLE_CAP: |
| 521 | case KVM_CAP_ENABLE_CAP_VM: | ||
| 522 | case KVM_CAP_ONE_REG: | 535 | case KVM_CAP_ONE_REG: |
| 523 | case KVM_CAP_IOEVENTFD: | 536 | case KVM_CAP_IOEVENTFD: |
| 524 | case KVM_CAP_DEVICE_CTRL: | 537 | case KVM_CAP_DEVICE_CTRL: |
| @@ -543,8 +556,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) | |||
| 543 | #ifdef CONFIG_PPC_BOOK3S_64 | 556 | #ifdef CONFIG_PPC_BOOK3S_64 |
| 544 | case KVM_CAP_SPAPR_TCE: | 557 | case KVM_CAP_SPAPR_TCE: |
| 545 | case KVM_CAP_SPAPR_TCE_64: | 558 | case KVM_CAP_SPAPR_TCE_64: |
| 546 | /* fallthrough */ | 559 | r = 1; |
| 560 | break; | ||
| 547 | case KVM_CAP_SPAPR_TCE_VFIO: | 561 | case KVM_CAP_SPAPR_TCE_VFIO: |
| 562 | r = !!cpu_has_feature(CPU_FTR_HVMODE); | ||
| 563 | break; | ||
| 548 | case KVM_CAP_PPC_RTAS: | 564 | case KVM_CAP_PPC_RTAS: |
| 549 | case KVM_CAP_PPC_FIXUP_HCALL: | 565 | case KVM_CAP_PPC_FIXUP_HCALL: |
| 550 | case KVM_CAP_PPC_ENABLE_HCALL: | 566 | case KVM_CAP_PPC_ENABLE_HCALL: |
| @@ -696,7 +712,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, | |||
| 696 | const struct kvm_memory_slot *new, | 712 | const struct kvm_memory_slot *new, |
| 697 | enum kvm_mr_change change) | 713 | enum kvm_mr_change change) |
| 698 | { | 714 | { |
| 699 | kvmppc_core_commit_memory_region(kvm, mem, old, new); | 715 | kvmppc_core_commit_memory_region(kvm, mem, old, new, change); |
| 700 | } | 716 | } |
| 701 | 717 | ||
| 702 | void kvm_arch_flush_shadow_memslot(struct kvm *kvm, | 718 | void kvm_arch_flush_shadow_memslot(struct kvm *kvm, |
| @@ -1192,6 +1208,14 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, | |||
| 1192 | kvmppc_set_vmx_byte(vcpu, gpr); | 1208 | kvmppc_set_vmx_byte(vcpu, gpr); |
| 1193 | break; | 1209 | break; |
| 1194 | #endif | 1210 | #endif |
| 1211 | #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE | ||
| 1212 | case KVM_MMIO_REG_NESTED_GPR: | ||
| 1213 | if (kvmppc_need_byteswap(vcpu)) | ||
| 1214 | gpr = swab64(gpr); | ||
| 1215 | kvm_vcpu_write_guest(vcpu, vcpu->arch.nested_io_gpr, &gpr, | ||
| 1216 | sizeof(gpr)); | ||
| 1217 | break; | ||
| 1218 | #endif | ||
| 1195 | default: | 1219 | default: |
| 1196 | BUG(); | 1220 | BUG(); |
| 1197 | } | 1221 | } |
| @@ -2084,8 +2108,8 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, | |||
| 2084 | } | 2108 | } |
| 2085 | 2109 | ||
| 2086 | 2110 | ||
| 2087 | static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, | 2111 | int kvm_vm_ioctl_enable_cap(struct kvm *kvm, |
| 2088 | struct kvm_enable_cap *cap) | 2112 | struct kvm_enable_cap *cap) |
| 2089 | { | 2113 | { |
| 2090 | int r; | 2114 | int r; |
| 2091 | 2115 | ||
| @@ -2273,15 +2297,6 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
| 2273 | 2297 | ||
| 2274 | break; | 2298 | break; |
| 2275 | } | 2299 | } |
| 2276 | case KVM_ENABLE_CAP: | ||
| 2277 | { | ||
| 2278 | struct kvm_enable_cap cap; | ||
| 2279 | r = -EFAULT; | ||
| 2280 | if (copy_from_user(&cap, argp, sizeof(cap))) | ||
| 2281 | goto out; | ||
| 2282 | r = kvm_vm_ioctl_enable_cap(kvm, &cap); | ||
| 2283 | break; | ||
| 2284 | } | ||
| 2285 | #ifdef CONFIG_SPAPR_TCE_IOMMU | 2300 | #ifdef CONFIG_SPAPR_TCE_IOMMU |
| 2286 | case KVM_CREATE_SPAPR_TCE_64: { | 2301 | case KVM_CREATE_SPAPR_TCE_64: { |
| 2287 | struct kvm_create_spapr_tce_64 create_tce_64; | 2302 | struct kvm_create_spapr_tce_64 create_tce_64; |
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 1697e903bbf2..2e6fb1d758c3 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c | |||
| @@ -636,6 +636,7 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) | |||
| 636 | switch (TRAP(regs)) { | 636 | switch (TRAP(regs)) { |
| 637 | case 0x300: | 637 | case 0x300: |
| 638 | case 0x380: | 638 | case 0x380: |
| 639 | case 0xe00: | ||
| 639 | printk(KERN_ALERT "Unable to handle kernel paging request for " | 640 | printk(KERN_ALERT "Unable to handle kernel paging request for " |
| 640 | "data at address 0x%08lx\n", regs->dar); | 641 | "data at address 0x%08lx\n", regs->dar); |
| 641 | break; | 642 | break; |
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index fe24150ff666..7f4bc58a53b9 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c | |||
| @@ -11,6 +11,9 @@ | |||
| 11 | * Jason J. Herne <jjherne@us.ibm.com> | 11 | * Jason J. Herne <jjherne@us.ibm.com> |
| 12 | */ | 12 | */ |
| 13 | 13 | ||
| 14 | #define KMSG_COMPONENT "kvm-s390" | ||
| 15 | #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt | ||
| 16 | |||
| 14 | #include <linux/compiler.h> | 17 | #include <linux/compiler.h> |
| 15 | #include <linux/err.h> | 18 | #include <linux/err.h> |
| 16 | #include <linux/fs.h> | 19 | #include <linux/fs.h> |
| @@ -44,10 +47,6 @@ | |||
| 44 | #include "kvm-s390.h" | 47 | #include "kvm-s390.h" |
| 45 | #include "gaccess.h" | 48 | #include "gaccess.h" |
| 46 | 49 | ||
| 47 | #define KMSG_COMPONENT "kvm-s390" | ||
| 48 | #undef pr_fmt | ||
| 49 | #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt | ||
| 50 | |||
| 51 | #define CREATE_TRACE_POINTS | 50 | #define CREATE_TRACE_POINTS |
| 52 | #include "trace.h" | 51 | #include "trace.h" |
| 53 | #include "trace-s390.h" | 52 | #include "trace-s390.h" |
| @@ -417,19 +416,30 @@ static void kvm_s390_cpu_feat_init(void) | |||
| 417 | 416 | ||
| 418 | int kvm_arch_init(void *opaque) | 417 | int kvm_arch_init(void *opaque) |
| 419 | { | 418 | { |
| 419 | int rc; | ||
| 420 | |||
| 420 | kvm_s390_dbf = debug_register("kvm-trace", 32, 1, 7 * sizeof(long)); | 421 | kvm_s390_dbf = debug_register("kvm-trace", 32, 1, 7 * sizeof(long)); |
| 421 | if (!kvm_s390_dbf) | 422 | if (!kvm_s390_dbf) |
| 422 | return -ENOMEM; | 423 | return -ENOMEM; |
| 423 | 424 | ||
| 424 | if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view)) { | 425 | if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view)) { |
| 425 | debug_unregister(kvm_s390_dbf); | 426 | rc = -ENOMEM; |
| 426 | return -ENOMEM; | 427 | goto out_debug_unreg; |
| 427 | } | 428 | } |
| 428 | 429 | ||
| 429 | kvm_s390_cpu_feat_init(); | 430 | kvm_s390_cpu_feat_init(); |
| 430 | 431 | ||
| 431 | /* Register floating interrupt controller interface. */ | 432 | /* Register floating interrupt controller interface. */ |
| 432 | return kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC); | 433 | rc = kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC); |
| 434 | if (rc) { | ||
| 435 | pr_err("Failed to register FLIC rc=%d\n", rc); | ||
| 436 | goto out_debug_unreg; | ||
| 437 | } | ||
| 438 | return 0; | ||
| 439 | |||
| 440 | out_debug_unreg: | ||
| 441 | debug_unregister(kvm_s390_dbf); | ||
| 442 | return rc; | ||
| 433 | } | 443 | } |
| 434 | 444 | ||
| 435 | void kvm_arch_exit(void) | 445 | void kvm_arch_exit(void) |
| @@ -464,7 +474,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) | |||
| 464 | case KVM_CAP_S390_CSS_SUPPORT: | 474 | case KVM_CAP_S390_CSS_SUPPORT: |
| 465 | case KVM_CAP_IOEVENTFD: | 475 | case KVM_CAP_IOEVENTFD: |
| 466 | case KVM_CAP_DEVICE_CTRL: | 476 | case KVM_CAP_DEVICE_CTRL: |
| 467 | case KVM_CAP_ENABLE_CAP_VM: | ||
| 468 | case KVM_CAP_S390_IRQCHIP: | 477 | case KVM_CAP_S390_IRQCHIP: |
| 469 | case KVM_CAP_VM_ATTRIBUTES: | 478 | case KVM_CAP_VM_ATTRIBUTES: |
| 470 | case KVM_CAP_MP_STATE: | 479 | case KVM_CAP_MP_STATE: |
| @@ -607,7 +616,7 @@ static void icpt_operexc_on_all_vcpus(struct kvm *kvm) | |||
| 607 | } | 616 | } |
| 608 | } | 617 | } |
| 609 | 618 | ||
| 610 | static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) | 619 | int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) |
| 611 | { | 620 | { |
| 612 | int r; | 621 | int r; |
| 613 | 622 | ||
| @@ -1933,14 +1942,6 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
| 1933 | r = kvm_s390_inject_vm(kvm, &s390int); | 1942 | r = kvm_s390_inject_vm(kvm, &s390int); |
| 1934 | break; | 1943 | break; |
| 1935 | } | 1944 | } |
| 1936 | case KVM_ENABLE_CAP: { | ||
| 1937 | struct kvm_enable_cap cap; | ||
| 1938 | r = -EFAULT; | ||
| 1939 | if (copy_from_user(&cap, argp, sizeof(cap))) | ||
| 1940 | break; | ||
| 1941 | r = kvm_vm_ioctl_enable_cap(kvm, &cap); | ||
| 1942 | break; | ||
| 1943 | } | ||
| 1944 | case KVM_CREATE_IRQCHIP: { | 1945 | case KVM_CREATE_IRQCHIP: { |
| 1945 | struct kvm_irq_routing_entry routing; | 1946 | struct kvm_irq_routing_entry routing; |
| 1946 | 1947 | ||
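With the powerpc and s390 hunks above dropping their private KVM_ENABLE_CAP handling and un-static-ing kvm_vm_ioctl_enable_cap(), the vm-level ioctl is dispatched from common KVM code instead, and userspace is unchanged. A minimal userspace sketch of enabling a VM-scoped capability; the file descriptor and capability number are placeholders:

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Enable a VM-scoped capability on an existing VM file descriptor. */
	static int enable_vm_cap(int vm_fd, __u32 cap_nr)
	{
		struct kvm_enable_cap cap;

		memset(&cap, 0, sizeof(cap));
		cap.cap = cap_nr;	/* e.g. a KVM_CAP_* value reported by KVM_CHECK_EXTENSION */
		return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);	/* 0 on success, -1 on error */
	}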
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 3a0aa83cbd07..9494ca68fd9d 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c | |||
| @@ -68,6 +68,7 @@ static struct pt_cap_desc { | |||
| 68 | PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)), | 68 | PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)), |
| 69 | PT_CAP(topa_multiple_entries, 0, CPUID_ECX, BIT(1)), | 69 | PT_CAP(topa_multiple_entries, 0, CPUID_ECX, BIT(1)), |
| 70 | PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)), | 70 | PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)), |
| 71 | PT_CAP(output_subsys, 0, CPUID_ECX, BIT(3)), | ||
| 71 | PT_CAP(payloads_lip, 0, CPUID_ECX, BIT(31)), | 72 | PT_CAP(payloads_lip, 0, CPUID_ECX, BIT(31)), |
| 72 | PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x3), | 73 | PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x3), |
| 73 | PT_CAP(mtc_periods, 1, CPUID_EAX, 0xffff0000), | 74 | PT_CAP(mtc_periods, 1, CPUID_EAX, 0xffff0000), |
| @@ -75,14 +76,21 @@ static struct pt_cap_desc { | |||
| 75 | PT_CAP(psb_periods, 1, CPUID_EBX, 0xffff0000), | 76 | PT_CAP(psb_periods, 1, CPUID_EBX, 0xffff0000), |
| 76 | }; | 77 | }; |
| 77 | 78 | ||
| 78 | static u32 pt_cap_get(enum pt_capabilities cap) | 79 | u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability) |
| 79 | { | 80 | { |
| 80 | struct pt_cap_desc *cd = &pt_caps[cap]; | 81 | struct pt_cap_desc *cd = &pt_caps[capability]; |
| 81 | u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; | 82 | u32 c = caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; |
| 82 | unsigned int shift = __ffs(cd->mask); | 83 | unsigned int shift = __ffs(cd->mask); |
| 83 | 84 | ||
| 84 | return (c & cd->mask) >> shift; | 85 | return (c & cd->mask) >> shift; |
| 85 | } | 86 | } |
| 87 | EXPORT_SYMBOL_GPL(intel_pt_validate_cap); | ||
| 88 | |||
| 89 | u32 intel_pt_validate_hw_cap(enum pt_capabilities cap) | ||
| 90 | { | ||
| 91 | return intel_pt_validate_cap(pt_pmu.caps, cap); | ||
| 92 | } | ||
| 93 | EXPORT_SYMBOL_GPL(intel_pt_validate_hw_cap); | ||
| 86 | 94 | ||
| 87 | static ssize_t pt_cap_show(struct device *cdev, | 95 | static ssize_t pt_cap_show(struct device *cdev, |
| 88 | struct device_attribute *attr, | 96 | struct device_attribute *attr, |
| @@ -92,7 +100,7 @@ static ssize_t pt_cap_show(struct device *cdev, | |||
| 92 | container_of(attr, struct dev_ext_attribute, attr); | 100 | container_of(attr, struct dev_ext_attribute, attr); |
| 93 | enum pt_capabilities cap = (long)ea->var; | 101 | enum pt_capabilities cap = (long)ea->var; |
| 94 | 102 | ||
| 95 | return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap)); | 103 | return snprintf(buf, PAGE_SIZE, "%x\n", intel_pt_validate_hw_cap(cap)); |
| 96 | } | 104 | } |
| 97 | 105 | ||
| 98 | static struct attribute_group pt_cap_group __ro_after_init = { | 106 | static struct attribute_group pt_cap_group __ro_after_init = { |
| @@ -310,16 +318,16 @@ static bool pt_event_valid(struct perf_event *event) | |||
| 310 | return false; | 318 | return false; |
| 311 | 319 | ||
| 312 | if (config & RTIT_CTL_CYC_PSB) { | 320 | if (config & RTIT_CTL_CYC_PSB) { |
| 313 | if (!pt_cap_get(PT_CAP_psb_cyc)) | 321 | if (!intel_pt_validate_hw_cap(PT_CAP_psb_cyc)) |
| 314 | return false; | 322 | return false; |
| 315 | 323 | ||
| 316 | allowed = pt_cap_get(PT_CAP_psb_periods); | 324 | allowed = intel_pt_validate_hw_cap(PT_CAP_psb_periods); |
| 317 | requested = (config & RTIT_CTL_PSB_FREQ) >> | 325 | requested = (config & RTIT_CTL_PSB_FREQ) >> |
| 318 | RTIT_CTL_PSB_FREQ_OFFSET; | 326 | RTIT_CTL_PSB_FREQ_OFFSET; |
| 319 | if (requested && (!(allowed & BIT(requested)))) | 327 | if (requested && (!(allowed & BIT(requested)))) |
| 320 | return false; | 328 | return false; |
| 321 | 329 | ||
| 322 | allowed = pt_cap_get(PT_CAP_cycle_thresholds); | 330 | allowed = intel_pt_validate_hw_cap(PT_CAP_cycle_thresholds); |
| 323 | requested = (config & RTIT_CTL_CYC_THRESH) >> | 331 | requested = (config & RTIT_CTL_CYC_THRESH) >> |
| 324 | RTIT_CTL_CYC_THRESH_OFFSET; | 332 | RTIT_CTL_CYC_THRESH_OFFSET; |
| 325 | if (requested && (!(allowed & BIT(requested)))) | 333 | if (requested && (!(allowed & BIT(requested)))) |
| @@ -334,10 +342,10 @@ static bool pt_event_valid(struct perf_event *event) | |||
| 334 | * Spec says that setting mtc period bits while mtc bit in | 342 | * Spec says that setting mtc period bits while mtc bit in |
| 335 | * CPUID is 0 will #GP, so better safe than sorry. | 343 | * CPUID is 0 will #GP, so better safe than sorry. |
| 336 | */ | 344 | */ |
| 337 | if (!pt_cap_get(PT_CAP_mtc)) | 345 | if (!intel_pt_validate_hw_cap(PT_CAP_mtc)) |
| 338 | return false; | 346 | return false; |
| 339 | 347 | ||
| 340 | allowed = pt_cap_get(PT_CAP_mtc_periods); | 348 | allowed = intel_pt_validate_hw_cap(PT_CAP_mtc_periods); |
| 341 | if (!allowed) | 349 | if (!allowed) |
| 342 | return false; | 350 | return false; |
| 343 | 351 | ||
| @@ -349,11 +357,11 @@ static bool pt_event_valid(struct perf_event *event) | |||
| 349 | } | 357 | } |
| 350 | 358 | ||
| 351 | if (config & RTIT_CTL_PWR_EVT_EN && | 359 | if (config & RTIT_CTL_PWR_EVT_EN && |
| 352 | !pt_cap_get(PT_CAP_power_event_trace)) | 360 | !intel_pt_validate_hw_cap(PT_CAP_power_event_trace)) |
| 353 | return false; | 361 | return false; |
| 354 | 362 | ||
| 355 | if (config & RTIT_CTL_PTW) { | 363 | if (config & RTIT_CTL_PTW) { |
| 356 | if (!pt_cap_get(PT_CAP_ptwrite)) | 364 | if (!intel_pt_validate_hw_cap(PT_CAP_ptwrite)) |
| 357 | return false; | 365 | return false; |
| 358 | 366 | ||
| 359 | /* FUPonPTW without PTW doesn't make sense */ | 367 | /* FUPonPTW without PTW doesn't make sense */ |
| @@ -598,7 +606,7 @@ static struct topa *topa_alloc(int cpu, gfp_t gfp) | |||
| 598 | * In case of single-entry ToPA, always put the self-referencing END | 606 | * In case of single-entry ToPA, always put the self-referencing END |
| 599 | * link as the 2nd entry in the table | 607 | * link as the 2nd entry in the table |
| 600 | */ | 608 | */ |
| 601 | if (!pt_cap_get(PT_CAP_topa_multiple_entries)) { | 609 | if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) { |
| 602 | TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT; | 610 | TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT; |
| 603 | TOPA_ENTRY(topa, 1)->end = 1; | 611 | TOPA_ENTRY(topa, 1)->end = 1; |
| 604 | } | 612 | } |
| @@ -638,7 +646,7 @@ static void topa_insert_table(struct pt_buffer *buf, struct topa *topa) | |||
| 638 | topa->offset = last->offset + last->size; | 646 | topa->offset = last->offset + last->size; |
| 639 | buf->last = topa; | 647 | buf->last = topa; |
| 640 | 648 | ||
| 641 | if (!pt_cap_get(PT_CAP_topa_multiple_entries)) | 649 | if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) |
| 642 | return; | 650 | return; |
| 643 | 651 | ||
| 644 | BUG_ON(last->last != TENTS_PER_PAGE - 1); | 652 | BUG_ON(last->last != TENTS_PER_PAGE - 1); |
| @@ -654,7 +662,7 @@ static void topa_insert_table(struct pt_buffer *buf, struct topa *topa) | |||
| 654 | static bool topa_table_full(struct topa *topa) | 662 | static bool topa_table_full(struct topa *topa) |
| 655 | { | 663 | { |
| 656 | /* single-entry ToPA is a special case */ | 664 | /* single-entry ToPA is a special case */ |
| 657 | if (!pt_cap_get(PT_CAP_topa_multiple_entries)) | 665 | if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) |
| 658 | return !!topa->last; | 666 | return !!topa->last; |
| 659 | 667 | ||
| 660 | return topa->last == TENTS_PER_PAGE - 1; | 668 | return topa->last == TENTS_PER_PAGE - 1; |
| @@ -690,7 +698,8 @@ static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp) | |||
| 690 | 698 | ||
| 691 | TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT; | 699 | TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT; |
| 692 | TOPA_ENTRY(topa, -1)->size = order; | 700 | TOPA_ENTRY(topa, -1)->size = order; |
| 693 | if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) { | 701 | if (!buf->snapshot && |
| 702 | !intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) { | ||
| 694 | TOPA_ENTRY(topa, -1)->intr = 1; | 703 | TOPA_ENTRY(topa, -1)->intr = 1; |
| 695 | TOPA_ENTRY(topa, -1)->stop = 1; | 704 | TOPA_ENTRY(topa, -1)->stop = 1; |
| 696 | } | 705 | } |
| @@ -725,7 +734,7 @@ static void pt_topa_dump(struct pt_buffer *buf) | |||
| 725 | topa->table[i].intr ? 'I' : ' ', | 734 | topa->table[i].intr ? 'I' : ' ', |
| 726 | topa->table[i].stop ? 'S' : ' ', | 735 | topa->table[i].stop ? 'S' : ' ', |
| 727 | *(u64 *)&topa->table[i]); | 736 | *(u64 *)&topa->table[i]); |
| 728 | if ((pt_cap_get(PT_CAP_topa_multiple_entries) && | 737 | if ((intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) && |
| 729 | topa->table[i].stop) || | 738 | topa->table[i].stop) || |
| 730 | topa->table[i].end) | 739 | topa->table[i].end) |
| 731 | break; | 740 | break; |
| @@ -828,7 +837,7 @@ static void pt_handle_status(struct pt *pt) | |||
| 828 | * means we are already losing data; need to let the decoder | 837 | * means we are already losing data; need to let the decoder |
| 829 | * know. | 838 | * know. |
| 830 | */ | 839 | */ |
| 831 | if (!pt_cap_get(PT_CAP_topa_multiple_entries) || | 840 | if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) || |
| 832 | buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) { | 841 | buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) { |
| 833 | perf_aux_output_flag(&pt->handle, | 842 | perf_aux_output_flag(&pt->handle, |
| 834 | PERF_AUX_FLAG_TRUNCATED); | 843 | PERF_AUX_FLAG_TRUNCATED); |
| @@ -840,7 +849,8 @@ static void pt_handle_status(struct pt *pt) | |||
| 840 | * Also on single-entry ToPA implementations, interrupt will come | 849 | * Also on single-entry ToPA implementations, interrupt will come |
| 841 | * before the output reaches its output region's boundary. | 850 | * before the output reaches its output region's boundary. |
| 842 | */ | 851 | */ |
| 843 | if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot && | 852 | if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) && |
| 853 | !buf->snapshot && | ||
| 844 | pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) { | 854 | pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) { |
| 845 | void *head = pt_buffer_region(buf); | 855 | void *head = pt_buffer_region(buf); |
| 846 | 856 | ||
| @@ -931,7 +941,7 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, | |||
| 931 | 941 | ||
| 932 | 942 | ||
| 933 | /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */ | 943 | /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */ |
| 934 | if (!pt_cap_get(PT_CAP_topa_multiple_entries)) | 944 | if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) |
| 935 | return 0; | 945 | return 0; |
| 936 | 946 | ||
| 937 | /* clear STOP and INT from current entry */ | 947 | /* clear STOP and INT from current entry */ |
| @@ -1082,7 +1092,7 @@ static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages, | |||
| 1082 | pt_buffer_setup_topa_index(buf); | 1092 | pt_buffer_setup_topa_index(buf); |
| 1083 | 1093 | ||
| 1084 | /* link last table to the first one, unless we're double buffering */ | 1094 | /* link last table to the first one, unless we're double buffering */ |
| 1085 | if (pt_cap_get(PT_CAP_topa_multiple_entries)) { | 1095 | if (intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) { |
| 1086 | TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT; | 1096 | TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT; |
| 1087 | TOPA_ENTRY(buf->last, -1)->end = 1; | 1097 | TOPA_ENTRY(buf->last, -1)->end = 1; |
| 1088 | } | 1098 | } |
| @@ -1153,7 +1163,7 @@ static int pt_addr_filters_init(struct perf_event *event) | |||
| 1153 | struct pt_filters *filters; | 1163 | struct pt_filters *filters; |
| 1154 | int node = event->cpu == -1 ? -1 : cpu_to_node(event->cpu); | 1164 | int node = event->cpu == -1 ? -1 : cpu_to_node(event->cpu); |
| 1155 | 1165 | ||
| 1156 | if (!pt_cap_get(PT_CAP_num_address_ranges)) | 1166 | if (!intel_pt_validate_hw_cap(PT_CAP_num_address_ranges)) |
| 1157 | return 0; | 1167 | return 0; |
| 1158 | 1168 | ||
| 1159 | filters = kzalloc_node(sizeof(struct pt_filters), GFP_KERNEL, node); | 1169 | filters = kzalloc_node(sizeof(struct pt_filters), GFP_KERNEL, node); |
| @@ -1202,7 +1212,7 @@ static int pt_event_addr_filters_validate(struct list_head *filters) | |||
| 1202 | return -EINVAL; | 1212 | return -EINVAL; |
| 1203 | } | 1213 | } |
| 1204 | 1214 | ||
| 1205 | if (++range > pt_cap_get(PT_CAP_num_address_ranges)) | 1215 | if (++range > intel_pt_validate_hw_cap(PT_CAP_num_address_ranges)) |
| 1206 | return -EOPNOTSUPP; | 1216 | return -EOPNOTSUPP; |
| 1207 | } | 1217 | } |
| 1208 | 1218 | ||
| @@ -1507,12 +1517,12 @@ static __init int pt_init(void) | |||
| 1507 | if (ret) | 1517 | if (ret) |
| 1508 | return ret; | 1518 | return ret; |
| 1509 | 1519 | ||
| 1510 | if (!pt_cap_get(PT_CAP_topa_output)) { | 1520 | if (!intel_pt_validate_hw_cap(PT_CAP_topa_output)) { |
| 1511 | pr_warn("ToPA output is not supported on this CPU\n"); | 1521 | pr_warn("ToPA output is not supported on this CPU\n"); |
| 1512 | return -ENODEV; | 1522 | return -ENODEV; |
| 1513 | } | 1523 | } |
| 1514 | 1524 | ||
| 1515 | if (!pt_cap_get(PT_CAP_topa_multiple_entries)) | 1525 | if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) |
| 1516 | pt_pmu.pmu.capabilities = | 1526 | pt_pmu.pmu.capabilities = |
| 1517 | PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF; | 1527 | PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF; |
| 1518 | 1528 | ||
| @@ -1530,7 +1540,7 @@ static __init int pt_init(void) | |||
| 1530 | pt_pmu.pmu.addr_filters_sync = pt_event_addr_filters_sync; | 1540 | pt_pmu.pmu.addr_filters_sync = pt_event_addr_filters_sync; |
| 1531 | pt_pmu.pmu.addr_filters_validate = pt_event_addr_filters_validate; | 1541 | pt_pmu.pmu.addr_filters_validate = pt_event_addr_filters_validate; |
| 1532 | pt_pmu.pmu.nr_addr_filters = | 1542 | pt_pmu.pmu.nr_addr_filters = |
| 1533 | pt_cap_get(PT_CAP_num_address_ranges); | 1543 | intel_pt_validate_hw_cap(PT_CAP_num_address_ranges); |
| 1534 | 1544 | ||
| 1535 | ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1); | 1545 | ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1); |
| 1536 | 1546 | ||
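The pt.c changes above only rename and export the capability lookup: intel_pt_validate_cap() works on an arbitrary caps[] array (so a caller can validate values it did not read from the host CPUID itself), while intel_pt_validate_hw_cap() keeps the old pt_cap_get() behaviour of reading pt_pmu.caps. The extraction step itself is unchanged; a standalone sketch of the same mask-and-shift logic, with hypothetical names:

	#include <stdint.h>

	/* Illustrative only: pick the saved CPUID register for a capability,
	 * mask it, and shift the field down to bit 0, as intel_pt_validate_cap()
	 * does.  'regs_per_leaf' mirrors PT_CPUID_REGS_NUM (4). */
	static uint32_t decode_cap(const uint32_t *caps, unsigned int leaf,
				   unsigned int reg, uint32_t mask,
				   unsigned int regs_per_leaf)
	{
		uint32_t c = caps[leaf * regs_per_leaf + reg];
		unsigned int shift = __builtin_ctz(mask);	/* lowest set bit, like __ffs() */

		return (c & mask) >> shift;
	}

	/* e.g. num_address_ranges uses leaf 1, EAX, mask 0x3 (see the PT_CAP table above). */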
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h index 0eb41d07b79a..269e15a9086c 100644 --- a/arch/x86/events/intel/pt.h +++ b/arch/x86/events/intel/pt.h | |||
| @@ -20,43 +20,6 @@ | |||
| 20 | #define __INTEL_PT_H__ | 20 | #define __INTEL_PT_H__ |
| 21 | 21 | ||
| 22 | /* | 22 | /* |
| 23 | * PT MSR bit definitions | ||
| 24 | */ | ||
| 25 | #define RTIT_CTL_TRACEEN BIT(0) | ||
| 26 | #define RTIT_CTL_CYCLEACC BIT(1) | ||
| 27 | #define RTIT_CTL_OS BIT(2) | ||
| 28 | #define RTIT_CTL_USR BIT(3) | ||
| 29 | #define RTIT_CTL_PWR_EVT_EN BIT(4) | ||
| 30 | #define RTIT_CTL_FUP_ON_PTW BIT(5) | ||
| 31 | #define RTIT_CTL_CR3EN BIT(7) | ||
| 32 | #define RTIT_CTL_TOPA BIT(8) | ||
| 33 | #define RTIT_CTL_MTC_EN BIT(9) | ||
| 34 | #define RTIT_CTL_TSC_EN BIT(10) | ||
| 35 | #define RTIT_CTL_DISRETC BIT(11) | ||
| 36 | #define RTIT_CTL_PTW_EN BIT(12) | ||
| 37 | #define RTIT_CTL_BRANCH_EN BIT(13) | ||
| 38 | #define RTIT_CTL_MTC_RANGE_OFFSET 14 | ||
| 39 | #define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) | ||
| 40 | #define RTIT_CTL_CYC_THRESH_OFFSET 19 | ||
| 41 | #define RTIT_CTL_CYC_THRESH (0x0full << RTIT_CTL_CYC_THRESH_OFFSET) | ||
| 42 | #define RTIT_CTL_PSB_FREQ_OFFSET 24 | ||
| 43 | #define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) | ||
| 44 | #define RTIT_CTL_ADDR0_OFFSET 32 | ||
| 45 | #define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET) | ||
| 46 | #define RTIT_CTL_ADDR1_OFFSET 36 | ||
| 47 | #define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET) | ||
| 48 | #define RTIT_CTL_ADDR2_OFFSET 40 | ||
| 49 | #define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET) | ||
| 50 | #define RTIT_CTL_ADDR3_OFFSET 44 | ||
| 51 | #define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET) | ||
| 52 | #define RTIT_STATUS_FILTEREN BIT(0) | ||
| 53 | #define RTIT_STATUS_CONTEXTEN BIT(1) | ||
| 54 | #define RTIT_STATUS_TRIGGEREN BIT(2) | ||
| 55 | #define RTIT_STATUS_BUFFOVF BIT(3) | ||
| 56 | #define RTIT_STATUS_ERROR BIT(4) | ||
| 57 | #define RTIT_STATUS_STOPPED BIT(5) | ||
| 58 | |||
| 59 | /* | ||
| 60 | * Single-entry ToPA: when this close to region boundary, switch | 23 | * Single-entry ToPA: when this close to region boundary, switch |
| 61 | * buffers to avoid losing data. | 24 | * buffers to avoid losing data. |
| 62 | */ | 25 | */ |
| @@ -82,30 +45,9 @@ struct topa_entry { | |||
| 82 | u64 rsvd4 : 16; | 45 | u64 rsvd4 : 16; |
| 83 | }; | 46 | }; |
| 84 | 47 | ||
| 85 | #define PT_CPUID_LEAVES 2 | ||
| 86 | #define PT_CPUID_REGS_NUM 4 /* number of registers (eax, ebx, ecx, edx) */ | ||
| 87 | |||
| 88 | /* TSC to Core Crystal Clock Ratio */ | 48 | /* TSC to Core Crystal Clock Ratio */ |
| 89 | #define CPUID_TSC_LEAF 0x15 | 49 | #define CPUID_TSC_LEAF 0x15 |
| 90 | 50 | ||
| 91 | enum pt_capabilities { | ||
| 92 | PT_CAP_max_subleaf = 0, | ||
| 93 | PT_CAP_cr3_filtering, | ||
| 94 | PT_CAP_psb_cyc, | ||
| 95 | PT_CAP_ip_filtering, | ||
| 96 | PT_CAP_mtc, | ||
| 97 | PT_CAP_ptwrite, | ||
| 98 | PT_CAP_power_event_trace, | ||
| 99 | PT_CAP_topa_output, | ||
| 100 | PT_CAP_topa_multiple_entries, | ||
| 101 | PT_CAP_single_range_output, | ||
| 102 | PT_CAP_payloads_lip, | ||
| 103 | PT_CAP_num_address_ranges, | ||
| 104 | PT_CAP_mtc_periods, | ||
| 105 | PT_CAP_cycle_thresholds, | ||
| 106 | PT_CAP_psb_periods, | ||
| 107 | }; | ||
| 108 | |||
| 109 | struct pt_pmu { | 51 | struct pt_pmu { |
| 110 | struct pmu pmu; | 52 | struct pmu pmu; |
| 111 | u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; | 53 | u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; |
diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c index b8e60cc50461..dd0a843f766d 100644 --- a/arch/x86/hyperv/nested.c +++ b/arch/x86/hyperv/nested.c | |||
| @@ -7,6 +7,7 @@ | |||
| 7 | * | 7 | * |
| 8 | * Author : Lan Tianyu <Tianyu.Lan@microsoft.com> | 8 | * Author : Lan Tianyu <Tianyu.Lan@microsoft.com> |
| 9 | */ | 9 | */ |
| 10 | #define pr_fmt(fmt) "Hyper-V: " fmt | ||
| 10 | 11 | ||
| 11 | 12 | ||
| 12 | #include <linux/types.h> | 13 | #include <linux/types.h> |
| @@ -54,3 +55,82 @@ fault: | |||
| 54 | return ret; | 55 | return ret; |
| 55 | } | 56 | } |
| 56 | EXPORT_SYMBOL_GPL(hyperv_flush_guest_mapping); | 57 | EXPORT_SYMBOL_GPL(hyperv_flush_guest_mapping); |
| 58 | |||
| 59 | int hyperv_fill_flush_guest_mapping_list( | ||
| 60 | struct hv_guest_mapping_flush_list *flush, | ||
| 61 | u64 start_gfn, u64 pages) | ||
| 62 | { | ||
| 63 | u64 cur = start_gfn; | ||
| 64 | u64 additional_pages; | ||
| 65 | int gpa_n = 0; | ||
| 66 | |||
| 67 | do { | ||
| 68 | /* | ||
| 69 | * If flush requests exceed max flush count, go back to | ||
| 70 | * flush tlbs without range. | ||
| 71 | */ | ||
| 72 | if (gpa_n >= HV_MAX_FLUSH_REP_COUNT) | ||
| 73 | return -ENOSPC; | ||
| 74 | |||
| 75 | additional_pages = min_t(u64, pages, HV_MAX_FLUSH_PAGES) - 1; | ||
| 76 | |||
| 77 | flush->gpa_list[gpa_n].page.additional_pages = additional_pages; | ||
| 78 | flush->gpa_list[gpa_n].page.largepage = false; | ||
| 79 | flush->gpa_list[gpa_n].page.basepfn = cur; | ||
| 80 | |||
| 81 | pages -= additional_pages + 1; | ||
| 82 | cur += additional_pages + 1; | ||
| 83 | gpa_n++; | ||
| 84 | } while (pages > 0); | ||
| 85 | |||
| 86 | return gpa_n; | ||
| 87 | } | ||
| 88 | EXPORT_SYMBOL_GPL(hyperv_fill_flush_guest_mapping_list); | ||
| 89 | |||
| 90 | int hyperv_flush_guest_mapping_range(u64 as, | ||
| 91 | hyperv_fill_flush_list_func fill_flush_list_func, void *data) | ||
| 92 | { | ||
| 93 | struct hv_guest_mapping_flush_list **flush_pcpu; | ||
| 94 | struct hv_guest_mapping_flush_list *flush; | ||
| 95 | u64 status = 0; | ||
| 96 | unsigned long flags; | ||
| 97 | int ret = -ENOTSUPP; | ||
| 98 | int gpa_n = 0; | ||
| 99 | |||
| 100 | if (!hv_hypercall_pg || !fill_flush_list_func) | ||
| 101 | goto fault; | ||
| 102 | |||
| 103 | local_irq_save(flags); | ||
| 104 | |||
| 105 | flush_pcpu = (struct hv_guest_mapping_flush_list **) | ||
| 106 | this_cpu_ptr(hyperv_pcpu_input_arg); | ||
| 107 | |||
| 108 | flush = *flush_pcpu; | ||
| 109 | if (unlikely(!flush)) { | ||
| 110 | local_irq_restore(flags); | ||
| 111 | goto fault; | ||
| 112 | } | ||
| 113 | |||
| 114 | flush->address_space = as; | ||
| 115 | flush->flags = 0; | ||
| 116 | |||
| 117 | gpa_n = fill_flush_list_func(flush, data); | ||
| 118 | if (gpa_n < 0) { | ||
| 119 | local_irq_restore(flags); | ||
| 120 | goto fault; | ||
| 121 | } | ||
| 122 | |||
| 123 | status = hv_do_rep_hypercall(HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST, | ||
| 124 | gpa_n, 0, flush, NULL); | ||
| 125 | |||
| 126 | local_irq_restore(flags); | ||
| 127 | |||
| 128 | if (!(status & HV_HYPERCALL_RESULT_MASK)) | ||
| 129 | ret = 0; | ||
| 130 | else | ||
| 131 | ret = status; | ||
| 132 | fault: | ||
| 133 | trace_hyperv_nested_flush_guest_mapping_range(as, ret); | ||
| 134 | return ret; | ||
| 135 | } | ||
| 136 | EXPORT_SYMBOL_GPL(hyperv_flush_guest_mapping_range); | ||
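The two new exports above split the work: hyperv_fill_flush_guest_mapping_list() packs a GFN range into at most HV_MAX_FLUSH_REP_COUNT gpa_list entries (each covering up to HV_MAX_FLUSH_PAGES pages), and hyperv_flush_guest_mapping_range() wraps the rep hypercall around a caller-supplied fill function. A kernel-side sketch of how a caller might wire the two together, assuming the declarations added by this diff; the flush_range struct and function names are illustrative, not part of the patch:

	/* Illustrative caller-side glue for hyperv_flush_guest_mapping_range(). */
	struct flush_range {
		u64 start_gfn;
		u64 pages;
	};

	static int fill_one_range(struct hv_guest_mapping_flush_list *flush, void *data)
	{
		struct flush_range *range = data;

		return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
							    range->pages);
	}

	/* Usage (sketch):
	 *	struct flush_range range = { .start_gfn = gfn, .pages = npages };
	 *	ret = hyperv_flush_guest_mapping_range(address_space, fill_one_range, &range);
	 * A negative return from the fill callback (e.g. -ENOSPC when the range needs
	 * too many entries) makes the range flush fail, so callers typically fall back
	 * to the full-address-space flush, as the comment in the fill helper notes. */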
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index df8e94e2f7be..6d6122524711 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h | |||
| @@ -281,6 +281,7 @@ | |||
| 281 | #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ | 281 | #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ |
| 282 | #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ | 282 | #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ |
| 283 | #define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ | 283 | #define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ |
| 284 | #define X86_FEATURE_WBNOINVD (13*32+ 9) /* WBNOINVD instruction */ | ||
| 284 | #define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */ | 285 | #define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */ |
| 285 | #define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */ | 286 | #define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */ |
| 286 | #define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */ | 287 | #define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */ |
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 4139f7650fe5..705dafc2d11a 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h | |||
| @@ -10,6 +10,7 @@ | |||
| 10 | #define _ASM_X86_HYPERV_TLFS_H | 10 | #define _ASM_X86_HYPERV_TLFS_H |
| 11 | 11 | ||
| 12 | #include <linux/types.h> | 12 | #include <linux/types.h> |
| 13 | #include <asm/page.h> | ||
| 13 | 14 | ||
| 14 | /* | 15 | /* |
| 15 | * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent | 16 | * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent |
| @@ -30,158 +31,150 @@ | |||
| 30 | /* | 31 | /* |
| 31 | * Feature identification. EAX indicates which features are available | 32 | * Feature identification. EAX indicates which features are available |
| 32 | * to the partition based upon the current partition privileges. | 33 | * to the partition based upon the current partition privileges. |
| 34 | * These are HYPERV_CPUID_FEATURES.EAX bits. | ||
| 33 | */ | 35 | */ |
| 34 | 36 | ||
| 35 | /* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */ | 37 | /* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */ |
| 36 | #define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0) | 38 | #define HV_X64_MSR_VP_RUNTIME_AVAILABLE BIT(0) |
| 37 | /* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ | 39 | /* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ |
| 38 | #define HV_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1) | 40 | #define HV_MSR_TIME_REF_COUNT_AVAILABLE BIT(1) |
| 39 | /* Partition reference TSC MSR is available */ | ||
| 40 | #define HV_MSR_REFERENCE_TSC_AVAILABLE (1 << 9) | ||
| 41 | /* Partition Guest IDLE MSR is available */ | ||
| 42 | #define HV_X64_MSR_GUEST_IDLE_AVAILABLE (1 << 10) | ||
| 43 | |||
| 44 | /* A partition's reference time stamp counter (TSC) page */ | ||
| 45 | #define HV_X64_MSR_REFERENCE_TSC 0x40000021 | ||
| 46 | |||
| 47 | /* | ||
| 48 | * There is a single feature flag that signifies if the partition has access | ||
| 49 | * to MSRs with local APIC and TSC frequencies. | ||
| 50 | */ | ||
| 51 | #define HV_X64_ACCESS_FREQUENCY_MSRS (1 << 11) | ||
| 52 | |||
| 53 | /* AccessReenlightenmentControls privilege */ | ||
| 54 | #define HV_X64_ACCESS_REENLIGHTENMENT BIT(13) | ||
| 55 | |||
| 56 | /* | 41 | /* |
| 57 | * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM | 42 | * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM |
| 58 | * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available | 43 | * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available |
| 59 | */ | 44 | */ |
| 60 | #define HV_X64_MSR_SYNIC_AVAILABLE (1 << 2) | 45 | #define HV_X64_MSR_SYNIC_AVAILABLE BIT(2) |
| 61 | /* | 46 | /* |
| 62 | * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through | 47 | * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through |
| 63 | * HV_X64_MSR_STIMER3_COUNT) available | 48 | * HV_X64_MSR_STIMER3_COUNT) available |
| 64 | */ | 49 | */ |
| 65 | #define HV_MSR_SYNTIMER_AVAILABLE (1 << 3) | 50 | #define HV_MSR_SYNTIMER_AVAILABLE BIT(3) |
| 66 | /* | 51 | /* |
| 67 | * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR) | 52 | * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR) |
| 68 | * are available | 53 | * are available |
| 69 | */ | 54 | */ |
| 70 | #define HV_X64_MSR_APIC_ACCESS_AVAILABLE (1 << 4) | 55 | #define HV_X64_MSR_APIC_ACCESS_AVAILABLE BIT(4) |
| 71 | /* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/ | 56 | /* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/ |
| 72 | #define HV_X64_MSR_HYPERCALL_AVAILABLE (1 << 5) | 57 | #define HV_X64_MSR_HYPERCALL_AVAILABLE BIT(5) |
| 73 | /* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/ | 58 | /* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/ |
| 74 | #define HV_X64_MSR_VP_INDEX_AVAILABLE (1 << 6) | 59 | #define HV_X64_MSR_VP_INDEX_AVAILABLE BIT(6) |
| 75 | /* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/ | 60 | /* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/ |
| 76 | #define HV_X64_MSR_RESET_AVAILABLE (1 << 7) | 61 | #define HV_X64_MSR_RESET_AVAILABLE BIT(7) |
| 77 | /* | 62 | /* |
| 78 | * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE, | 63 | * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE, |
| 79 | * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE, | 64 | * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE, |
| 80 | * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available | 65 | * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available |
| 81 | */ | 66 | */ |
| 82 | #define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8) | 67 | #define HV_X64_MSR_STAT_PAGES_AVAILABLE BIT(8) |
| 83 | 68 | /* Partition reference TSC MSR is available */ | |
| 84 | /* Frequency MSRs available */ | 69 | #define HV_MSR_REFERENCE_TSC_AVAILABLE BIT(9) |
| 85 | #define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE (1 << 8) | 70 | /* Partition Guest IDLE MSR is available */ |
| 86 | 71 | #define HV_X64_MSR_GUEST_IDLE_AVAILABLE BIT(10) | |
| 87 | /* Crash MSR available */ | 72 | /* |
| 88 | #define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE (1 << 10) | 73 | * There is a single feature flag that signifies if the partition has access |
| 89 | 74 | * to MSRs with local APIC and TSC frequencies. | |
| 90 | /* stimer Direct Mode is available */ | 75 | */ |
| 91 | #define HV_STIMER_DIRECT_MODE_AVAILABLE (1 << 19) | 76 | #define HV_X64_ACCESS_FREQUENCY_MSRS BIT(11) |
| 77 | /* AccessReenlightenmentControls privilege */ | ||
| 78 | #define HV_X64_ACCESS_REENLIGHTENMENT BIT(13) | ||
| 92 | 79 | ||
| 93 | /* | 80 | /* |
| 94 | * Feature identification: EBX indicates which flags were specified at | 81 | * Feature identification: indicates which flags were specified at partition |
| 95 | * partition creation. The format is the same as the partition creation | 82 | * creation. The format is the same as the partition creation flag structure |
| 96 | * flag structure defined in section Partition Creation Flags. | 83 | * defined in section Partition Creation Flags. |
| 84 | * These are HYPERV_CPUID_FEATURES.EBX bits. | ||
| 97 | */ | 85 | */ |
| 98 | #define HV_X64_CREATE_PARTITIONS (1 << 0) | 86 | #define HV_X64_CREATE_PARTITIONS BIT(0) |
| 99 | #define HV_X64_ACCESS_PARTITION_ID (1 << 1) | 87 | #define HV_X64_ACCESS_PARTITION_ID BIT(1) |
| 100 | #define HV_X64_ACCESS_MEMORY_POOL (1 << 2) | 88 | #define HV_X64_ACCESS_MEMORY_POOL BIT(2) |
| 101 | #define HV_X64_ADJUST_MESSAGE_BUFFERS (1 << 3) | 89 | #define HV_X64_ADJUST_MESSAGE_BUFFERS BIT(3) |
| 102 | #define HV_X64_POST_MESSAGES (1 << 4) | 90 | #define HV_X64_POST_MESSAGES BIT(4) |
| 103 | #define HV_X64_SIGNAL_EVENTS (1 << 5) | 91 | #define HV_X64_SIGNAL_EVENTS BIT(5) |
| 104 | #define HV_X64_CREATE_PORT (1 << 6) | 92 | #define HV_X64_CREATE_PORT BIT(6) |
| 105 | #define HV_X64_CONNECT_PORT (1 << 7) | 93 | #define HV_X64_CONNECT_PORT BIT(7) |
| 106 | #define HV_X64_ACCESS_STATS (1 << 8) | 94 | #define HV_X64_ACCESS_STATS BIT(8) |
| 107 | #define HV_X64_DEBUGGING (1 << 11) | 95 | #define HV_X64_DEBUGGING BIT(11) |
| 108 | #define HV_X64_CPU_POWER_MANAGEMENT (1 << 12) | 96 | #define HV_X64_CPU_POWER_MANAGEMENT BIT(12) |
| 109 | #define HV_X64_CONFIGURE_PROFILER (1 << 13) | ||
| 110 | 97 | ||
| 111 | /* | 98 | /* |
| 112 | * Feature identification. EDX indicates which miscellaneous features | 99 | * Feature identification. EDX indicates which miscellaneous features |
| 113 | * are available to the partition. | 100 | * are available to the partition. |
| 101 | * These are HYPERV_CPUID_FEATURES.EDX bits. | ||
| 114 | */ | 102 | */ |
| 115 | /* The MWAIT instruction is available (per section MONITOR / MWAIT) */ | 103 | /* The MWAIT instruction is available (per section MONITOR / MWAIT) */ |
| 116 | #define HV_X64_MWAIT_AVAILABLE (1 << 0) | 104 | #define HV_X64_MWAIT_AVAILABLE BIT(0) |
| 117 | /* Guest debugging support is available */ | 105 | /* Guest debugging support is available */ |
| 118 | #define HV_X64_GUEST_DEBUGGING_AVAILABLE (1 << 1) | 106 | #define HV_X64_GUEST_DEBUGGING_AVAILABLE BIT(1) |
| 119 | /* Performance Monitor support is available*/ | 107 | /* Performance Monitor support is available*/ |
| 120 | #define HV_X64_PERF_MONITOR_AVAILABLE (1 << 2) | 108 | #define HV_X64_PERF_MONITOR_AVAILABLE BIT(2) |
| 121 | /* Support for physical CPU dynamic partitioning events is available*/ | 109 | /* Support for physical CPU dynamic partitioning events is available*/ |
| 122 | #define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE (1 << 3) | 110 | #define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE BIT(3) |
| 123 | /* | 111 | /* |
| 124 | * Support for passing hypercall input parameter block via XMM | 112 | * Support for passing hypercall input parameter block via XMM |
| 125 | * registers is available | 113 | * registers is available |
| 126 | */ | 114 | */ |
| 127 | #define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4) | 115 | #define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE BIT(4) |
| 128 | /* Support for a virtual guest idle state is available */ | 116 | /* Support for a virtual guest idle state is available */ |
| 129 | #define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5) | 117 | #define HV_X64_GUEST_IDLE_STATE_AVAILABLE BIT(5) |
| 130 | /* Guest crash data handler available */ | 118 | /* Frequency MSRs available */ |
| 131 | #define HV_X64_GUEST_CRASH_MSR_AVAILABLE (1 << 10) | 119 | #define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE BIT(8) |
| 120 | /* Crash MSR available */ | ||
| 121 | #define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE BIT(10) | ||
| 122 | /* stimer Direct Mode is available */ | ||
| 123 | #define HV_STIMER_DIRECT_MODE_AVAILABLE BIT(19) | ||
| 132 | 124 | ||
| 133 | /* | 125 | /* |
| 134 | * Implementation recommendations. Indicates which behaviors the hypervisor | 126 | * Implementation recommendations. Indicates which behaviors the hypervisor |
| 135 | * recommends the OS implement for optimal performance. | 127 | * recommends the OS implement for optimal performance. |
| 128 | * These are HYPERV_CPUID_ENLIGHTMENT_INFO.EAX bits. | ||
| 129 | */ | ||
| 130 | /* | ||
| 131 | * Recommend using hypercall for address space switches rather | ||
| 132 | * than MOV to CR3 instruction | ||
| 136 | */ | 133 | */ |
| 137 | /* | 134 | #define HV_X64_AS_SWITCH_RECOMMENDED BIT(0) |
| 138 | * Recommend using hypercall for address space switches rather | ||
| 139 | * than MOV to CR3 instruction | ||
| 140 | */ | ||
| 141 | #define HV_X64_AS_SWITCH_RECOMMENDED (1 << 0) | ||
| 142 | /* Recommend using hypercall for local TLB flushes rather | 135 | /* Recommend using hypercall for local TLB flushes rather |
| 143 | * than INVLPG or MOV to CR3 instructions */ | 136 | * than INVLPG or MOV to CR3 instructions */ |
| 144 | #define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1) | 137 | #define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED BIT(1) |
| 145 | /* | 138 | /* |
| 146 | * Recommend using hypercall for remote TLB flushes rather | 139 | * Recommend using hypercall for remote TLB flushes rather |
| 147 | * than inter-processor interrupts | 140 | * than inter-processor interrupts |
| 148 | */ | 141 | */ |
| 149 | #define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED (1 << 2) | 142 | #define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED BIT(2) |
| 150 | /* | 143 | /* |
| 151 | * Recommend using MSRs for accessing APIC registers | 144 | * Recommend using MSRs for accessing APIC registers |
| 152 | * EOI, ICR and TPR rather than their memory-mapped counterparts | 145 | * EOI, ICR and TPR rather than their memory-mapped counterparts |
| 153 | */ | 146 | */ |
| 154 | #define HV_X64_APIC_ACCESS_RECOMMENDED (1 << 3) | 147 | #define HV_X64_APIC_ACCESS_RECOMMENDED BIT(3) |
| 155 | /* Recommend using the hypervisor-provided MSR to initiate a system RESET */ | 148 | /* Recommend using the hypervisor-provided MSR to initiate a system RESET */ |
| 156 | #define HV_X64_SYSTEM_RESET_RECOMMENDED (1 << 4) | 149 | #define HV_X64_SYSTEM_RESET_RECOMMENDED BIT(4) |
| 157 | /* | 150 | /* |
| 158 | * Recommend using relaxed timing for this partition. If used, | 151 | * Recommend using relaxed timing for this partition. If used, |
| 159 | * the VM should disable any watchdog timeouts that rely on the | 152 | * the VM should disable any watchdog timeouts that rely on the |
| 160 | * timely delivery of external interrupts | 153 | * timely delivery of external interrupts |
| 161 | */ | 154 | */ |
| 162 | #define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5) | 155 | #define HV_X64_RELAXED_TIMING_RECOMMENDED BIT(5) |
| 163 | 156 | ||
| 164 | /* | 157 | /* |
| 165 | * Recommend not using Auto End-Of-Interrupt feature | 158 | * Recommend not using Auto End-Of-Interrupt feature |
| 166 | */ | 159 | */ |
| 167 | #define HV_DEPRECATING_AEOI_RECOMMENDED (1 << 9) | 160 | #define HV_DEPRECATING_AEOI_RECOMMENDED BIT(9) |
| 168 | 161 | ||
| 169 | /* | 162 | /* |
| 170 | * Recommend using cluster IPI hypercalls. | 163 | * Recommend using cluster IPI hypercalls. |
| 171 | */ | 164 | */ |
| 172 | #define HV_X64_CLUSTER_IPI_RECOMMENDED (1 << 10) | 165 | #define HV_X64_CLUSTER_IPI_RECOMMENDED BIT(10) |
| 173 | 166 | ||
| 174 | /* Recommend using the newer ExProcessorMasks interface */ | 167 | /* Recommend using the newer ExProcessorMasks interface */ |
| 175 | #define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11) | 168 | #define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED BIT(11) |
| 176 | 169 | ||
| 177 | /* Recommend using enlightened VMCS */ | 170 | /* Recommend using enlightened VMCS */ |
| 178 | #define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED (1 << 14) | 171 | #define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED BIT(14) |
| 179 | 172 | ||
| 180 | /* | 173 | /* Nested features. These are HYPERV_CPUID_NESTED_FEATURES.EAX bits. */ |
| 181 | * Crash notification flags. | 174 | #define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18) |
| 182 | */ | 175 | #define HV_X64_NESTED_MSR_BITMAP BIT(19) |
| 183 | #define HV_CRASH_CTL_CRASH_NOTIFY_MSG BIT_ULL(62) | 176 | |
| 184 | #define HV_CRASH_CTL_CRASH_NOTIFY BIT_ULL(63) | 177 | /* Hyper-V specific model specific registers (MSRs) */ |
| 185 | 178 | ||
| 186 | /* MSR used to identify the guest OS. */ | 179 | /* MSR used to identify the guest OS. */ |
| 187 | #define HV_X64_MSR_GUEST_OS_ID 0x40000000 | 180 | #define HV_X64_MSR_GUEST_OS_ID 0x40000000 |
| @@ -201,6 +194,9 @@ | |||
| 201 | /* MSR used to read the per-partition time reference counter */ | 194 | /* MSR used to read the per-partition time reference counter */ |
| 202 | #define HV_X64_MSR_TIME_REF_COUNT 0x40000020 | 195 | #define HV_X64_MSR_TIME_REF_COUNT 0x40000020 |
| 203 | 196 | ||
| 197 | /* A partition's reference time stamp counter (TSC) page */ | ||
| 198 | #define HV_X64_MSR_REFERENCE_TSC 0x40000021 | ||
| 199 | |||
| 204 | /* MSR used to retrieve the TSC frequency */ | 200 | /* MSR used to retrieve the TSC frequency */ |
| 205 | #define HV_X64_MSR_TSC_FREQUENCY 0x40000022 | 201 | #define HV_X64_MSR_TSC_FREQUENCY 0x40000022 |
| 206 | 202 | ||
| @@ -258,9 +254,11 @@ | |||
| 258 | #define HV_X64_MSR_CRASH_P3 0x40000103 | 254 | #define HV_X64_MSR_CRASH_P3 0x40000103 |
| 259 | #define HV_X64_MSR_CRASH_P4 0x40000104 | 255 | #define HV_X64_MSR_CRASH_P4 0x40000104 |
| 260 | #define HV_X64_MSR_CRASH_CTL 0x40000105 | 256 | #define HV_X64_MSR_CRASH_CTL 0x40000105 |
| 261 | #define HV_X64_MSR_CRASH_CTL_NOTIFY (1ULL << 63) | 257 | |
| 262 | #define HV_X64_MSR_CRASH_PARAMS \ | 258 | /* TSC emulation after migration */ |
| 263 | (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0)) | 259 | #define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 |
| 260 | #define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 | ||
| 261 | #define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108 | ||
| 264 | 262 | ||
| 265 | /* | 263 | /* |
| 266 | * Declare the MSR used to setup pages used to communicate with the hypervisor. | 264 | * Declare the MSR used to setup pages used to communicate with the hypervisor. |
| @@ -271,7 +269,7 @@ union hv_x64_msr_hypercall_contents { | |||
| 271 | u64 enable:1; | 269 | u64 enable:1; |
| 272 | u64 reserved:11; | 270 | u64 reserved:11; |
| 273 | u64 guest_physical_address:52; | 271 | u64 guest_physical_address:52; |
| 274 | }; | 272 | } __packed; |
| 275 | }; | 273 | }; |
| 276 | 274 | ||
| 277 | /* | 275 | /* |
| @@ -283,7 +281,7 @@ struct ms_hyperv_tsc_page { | |||
| 283 | volatile u64 tsc_scale; | 281 | volatile u64 tsc_scale; |
| 284 | volatile s64 tsc_offset; | 282 | volatile s64 tsc_offset; |
| 285 | u64 reserved2[509]; | 283 | u64 reserved2[509]; |
| 286 | }; | 284 | } __packed; |
| 287 | 285 | ||
| 288 | /* | 286 | /* |
| 289 | * The guest OS needs to register the guest ID with the hypervisor. | 287 | * The guest OS needs to register the guest ID with the hypervisor. |
| @@ -311,39 +309,37 @@ struct ms_hyperv_tsc_page { | |||
| 311 | 309 | ||
| 312 | #define HV_LINUX_VENDOR_ID 0x8100 | 310 | #define HV_LINUX_VENDOR_ID 0x8100 |
| 313 | 311 | ||
| 314 | /* TSC emulation after migration */ | ||
| 315 | #define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 | ||
| 316 | |||
| 317 | /* Nested features (CPUID 0x4000000A) EAX */ | ||
| 318 | #define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18) | ||
| 319 | #define HV_X64_NESTED_MSR_BITMAP BIT(19) | ||
| 320 | |||
| 321 | struct hv_reenlightenment_control { | 312 | struct hv_reenlightenment_control { |
| 322 | __u64 vector:8; | 313 | __u64 vector:8; |
| 323 | __u64 reserved1:8; | 314 | __u64 reserved1:8; |
| 324 | __u64 enabled:1; | 315 | __u64 enabled:1; |
| 325 | __u64 reserved2:15; | 316 | __u64 reserved2:15; |
| 326 | __u64 target_vp:32; | 317 | __u64 target_vp:32; |
| 327 | }; | 318 | } __packed; |
| 328 | |||
| 329 | #define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 | ||
| 330 | #define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108 | ||
| 331 | 319 | ||
| 332 | struct hv_tsc_emulation_control { | 320 | struct hv_tsc_emulation_control { |
| 333 | __u64 enabled:1; | 321 | __u64 enabled:1; |
| 334 | __u64 reserved:63; | 322 | __u64 reserved:63; |
| 335 | }; | 323 | } __packed; |
| 336 | 324 | ||
| 337 | struct hv_tsc_emulation_status { | 325 | struct hv_tsc_emulation_status { |
| 338 | __u64 inprogress:1; | 326 | __u64 inprogress:1; |
| 339 | __u64 reserved:63; | 327 | __u64 reserved:63; |
| 340 | }; | 328 | } __packed; |
| 341 | 329 | ||
| 342 | #define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001 | 330 | #define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001 |
| 343 | #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12 | 331 | #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12 |
| 344 | #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ | 332 | #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ |
| 345 | (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) | 333 | (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) |
| 346 | 334 | ||
| 335 | /* | ||
| 336 | * Crash notification (HV_X64_MSR_CRASH_CTL) flags. | ||
| 337 | */ | ||
| 338 | #define HV_CRASH_CTL_CRASH_NOTIFY_MSG BIT_ULL(62) | ||
| 339 | #define HV_CRASH_CTL_CRASH_NOTIFY BIT_ULL(63) | ||
| 340 | #define HV_X64_MSR_CRASH_PARAMS \ | ||
| 341 | (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0)) | ||
| 342 | |||
| 347 | #define HV_IPI_LOW_VECTOR 0x10 | 343 | #define HV_IPI_LOW_VECTOR 0x10 |
| 348 | #define HV_IPI_HIGH_VECTOR 0xff | 344 | #define HV_IPI_HIGH_VECTOR 0xff |
| 349 | 345 | ||
| @@ -358,6 +354,7 @@ struct hv_tsc_emulation_status { | |||
| 358 | #define HVCALL_POST_MESSAGE 0x005c | 354 | #define HVCALL_POST_MESSAGE 0x005c |
| 359 | #define HVCALL_SIGNAL_EVENT 0x005d | 355 | #define HVCALL_SIGNAL_EVENT 0x005d |
| 360 | #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af | 356 | #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af |
| 357 | #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 | ||
| 361 | 358 | ||
| 362 | #define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 | 359 | #define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 |
| 363 | #define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 | 360 | #define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 |
| @@ -409,7 +406,7 @@ typedef struct _HV_REFERENCE_TSC_PAGE { | |||
| 409 | __u32 res1; | 406 | __u32 res1; |
| 410 | __u64 tsc_scale; | 407 | __u64 tsc_scale; |
| 411 | __s64 tsc_offset; | 408 | __s64 tsc_offset; |
| 412 | } HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE; | 409 | } __packed HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE; |
| 413 | 410 | ||
| 414 | /* Define the number of synthetic interrupt sources. */ | 411 | /* Define the number of synthetic interrupt sources. */ |
| 415 | #define HV_SYNIC_SINT_COUNT (16) | 412 | #define HV_SYNIC_SINT_COUNT (16) |
| @@ -466,7 +463,7 @@ union hv_message_flags { | |||
| 466 | struct { | 463 | struct { |
| 467 | __u8 msg_pending:1; | 464 | __u8 msg_pending:1; |
| 468 | __u8 reserved:7; | 465 | __u8 reserved:7; |
| 469 | }; | 466 | } __packed; |
| 470 | }; | 467 | }; |
| 471 | 468 | ||
| 472 | /* Define port identifier type. */ | 469 | /* Define port identifier type. */ |
| @@ -475,7 +472,7 @@ union hv_port_id { | |||
| 475 | struct { | 472 | struct { |
| 476 | __u32 id:24; | 473 | __u32 id:24; |
| 477 | __u32 reserved:8; | 474 | __u32 reserved:8; |
| 478 | } u; | 475 | } __packed u; |
| 479 | }; | 476 | }; |
| 480 | 477 | ||
| 481 | /* Define synthetic interrupt controller message header. */ | 478 | /* Define synthetic interrupt controller message header. */ |
| @@ -488,7 +485,7 @@ struct hv_message_header { | |||
| 488 | __u64 sender; | 485 | __u64 sender; |
| 489 | union hv_port_id port; | 486 | union hv_port_id port; |
| 490 | }; | 487 | }; |
| 491 | }; | 488 | } __packed; |
| 492 | 489 | ||
| 493 | /* Define synthetic interrupt controller message format. */ | 490 | /* Define synthetic interrupt controller message format. */ |
| 494 | struct hv_message { | 491 | struct hv_message { |
| @@ -496,12 +493,12 @@ struct hv_message { | |||
| 496 | union { | 493 | union { |
| 497 | __u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT]; | 494 | __u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT]; |
| 498 | } u; | 495 | } u; |
| 499 | }; | 496 | } __packed; |
| 500 | 497 | ||
| 501 | /* Define the synthetic interrupt message page layout. */ | 498 | /* Define the synthetic interrupt message page layout. */ |
| 502 | struct hv_message_page { | 499 | struct hv_message_page { |
| 503 | struct hv_message sint_message[HV_SYNIC_SINT_COUNT]; | 500 | struct hv_message sint_message[HV_SYNIC_SINT_COUNT]; |
| 504 | }; | 501 | } __packed; |
| 505 | 502 | ||
| 506 | /* Define timer message payload structure. */ | 503 | /* Define timer message payload structure. */ |
| 507 | struct hv_timer_message_payload { | 504 | struct hv_timer_message_payload { |
| @@ -509,7 +506,7 @@ struct hv_timer_message_payload { | |||
| 509 | __u32 reserved; | 506 | __u32 reserved; |
| 510 | __u64 expiration_time; /* When the timer expired */ | 507 | __u64 expiration_time; /* When the timer expired */ |
| 511 | __u64 delivery_time; /* When the message was delivered */ | 508 | __u64 delivery_time; /* When the message was delivered */ |
| 512 | }; | 509 | } __packed; |
| 513 | 510 | ||
| 514 | /* Define virtual processor assist page structure. */ | 511 | /* Define virtual processor assist page structure. */ |
| 515 | struct hv_vp_assist_page { | 512 | struct hv_vp_assist_page { |
| @@ -518,8 +515,9 @@ struct hv_vp_assist_page { | |||
| 518 | __u64 vtl_control[2]; | 515 | __u64 vtl_control[2]; |
| 519 | __u64 nested_enlightenments_control[2]; | 516 | __u64 nested_enlightenments_control[2]; |
| 520 | __u32 enlighten_vmentry; | 517 | __u32 enlighten_vmentry; |
| 518 | __u32 padding; | ||
| 521 | __u64 current_nested_vmcs; | 519 | __u64 current_nested_vmcs; |
| 522 | }; | 520 | } __packed; |
| 523 | 521 | ||
| 524 | struct hv_enlightened_vmcs { | 522 | struct hv_enlightened_vmcs { |
| 525 | u32 revision_id; | 523 | u32 revision_id; |
| @@ -533,6 +531,8 @@ struct hv_enlightened_vmcs { | |||
| 533 | u16 host_gs_selector; | 531 | u16 host_gs_selector; |
| 534 | u16 host_tr_selector; | 532 | u16 host_tr_selector; |
| 535 | 533 | ||
| 534 | u16 padding16_1; | ||
| 535 | |||
| 536 | u64 host_ia32_pat; | 536 | u64 host_ia32_pat; |
| 537 | u64 host_ia32_efer; | 537 | u64 host_ia32_efer; |
| 538 | 538 | ||
| @@ -651,7 +651,7 @@ struct hv_enlightened_vmcs { | |||
| 651 | u64 ept_pointer; | 651 | u64 ept_pointer; |
| 652 | 652 | ||
| 653 | u16 virtual_processor_id; | 653 | u16 virtual_processor_id; |
| 654 | u16 padding16[3]; | 654 | u16 padding16_2[3]; |
| 655 | 655 | ||
| 656 | u64 padding64_2[5]; | 656 | u64 padding64_2[5]; |
| 657 | u64 guest_physical_address; | 657 | u64 guest_physical_address; |
| @@ -693,7 +693,7 @@ struct hv_enlightened_vmcs { | |||
| 693 | u32 nested_flush_hypercall:1; | 693 | u32 nested_flush_hypercall:1; |
| 694 | u32 msr_bitmap:1; | 694 | u32 msr_bitmap:1; |
| 695 | u32 reserved:30; | 695 | u32 reserved:30; |
| 696 | } hv_enlightenments_control; | 696 | } __packed hv_enlightenments_control; |
| 697 | u32 hv_vp_id; | 697 | u32 hv_vp_id; |
| 698 | 698 | ||
| 699 | u64 hv_vm_id; | 699 | u64 hv_vm_id; |
| @@ -703,7 +703,7 @@ struct hv_enlightened_vmcs { | |||
| 703 | u64 padding64_5[7]; | 703 | u64 padding64_5[7]; |
| 704 | u64 xss_exit_bitmap; | 704 | u64 xss_exit_bitmap; |
| 705 | u64 padding64_6[7]; | 705 | u64 padding64_6[7]; |
| 706 | }; | 706 | } __packed; |
| 707 | 707 | ||
| 708 | #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE 0 | 708 | #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE 0 |
| 709 | #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP BIT(0) | 709 | #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP BIT(0) |
| @@ -725,36 +725,129 @@ struct hv_enlightened_vmcs { | |||
| 725 | 725 | ||
| 726 | #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF | 726 | #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF |
| 727 | 727 | ||
| 728 | #define HV_STIMER_ENABLE (1ULL << 0) | 728 | /* Define synthetic interrupt controller flag constants. */ |
| 729 | #define HV_STIMER_PERIODIC (1ULL << 1) | 729 | #define HV_EVENT_FLAGS_COUNT (256 * 8) |
| 730 | #define HV_STIMER_LAZY (1ULL << 2) | 730 | #define HV_EVENT_FLAGS_LONG_COUNT (256 / sizeof(unsigned long)) |
| 731 | #define HV_STIMER_AUTOENABLE (1ULL << 3) | 731 | |
| 732 | #define HV_STIMER_SINT(config) (__u8)(((config) >> 16) & 0x0F) | 732 | /* |
| 733 | * Synthetic timer configuration. | ||
| 734 | */ | ||
| 735 | union hv_stimer_config { | ||
| 736 | u64 as_uint64; | ||
| 737 | struct { | ||
| 738 | u64 enable:1; | ||
| 739 | u64 periodic:1; | ||
| 740 | u64 lazy:1; | ||
| 741 | u64 auto_enable:1; | ||
| 742 | u64 apic_vector:8; | ||
| 743 | u64 direct_mode:1; | ||
| 744 | u64 reserved_z0:3; | ||
| 745 | u64 sintx:4; | ||
| 746 | u64 reserved_z1:44; | ||
| 747 | } __packed; | ||
| 748 | }; | ||
| 749 | |||
| 750 | |||
| 751 | /* Define the synthetic interrupt controller event flags format. */ | ||
| 752 | union hv_synic_event_flags { | ||
| 753 | unsigned long flags[HV_EVENT_FLAGS_LONG_COUNT]; | ||
| 754 | }; | ||
| 755 | |||
| 756 | /* Define SynIC control register. */ | ||
| 757 | union hv_synic_scontrol { | ||
| 758 | u64 as_uint64; | ||
| 759 | struct { | ||
| 760 | u64 enable:1; | ||
| 761 | u64 reserved:63; | ||
| 762 | } __packed; | ||
| 763 | }; | ||
| 764 | |||
| 765 | /* Define synthetic interrupt source. */ | ||
| 766 | union hv_synic_sint { | ||
| 767 | u64 as_uint64; | ||
| 768 | struct { | ||
| 769 | u64 vector:8; | ||
| 770 | u64 reserved1:8; | ||
| 771 | u64 masked:1; | ||
| 772 | u64 auto_eoi:1; | ||
| 773 | u64 reserved2:46; | ||
| 774 | } __packed; | ||
| 775 | }; | ||
| 776 | |||
| 777 | /* Define the format of the SIMP register */ | ||
| 778 | union hv_synic_simp { | ||
| 779 | u64 as_uint64; | ||
| 780 | struct { | ||
| 781 | u64 simp_enabled:1; | ||
| 782 | u64 preserved:11; | ||
| 783 | u64 base_simp_gpa:52; | ||
| 784 | } __packed; | ||
| 785 | }; | ||
| 786 | |||
| 787 | /* Define the format of the SIEFP register */ | ||
| 788 | union hv_synic_siefp { | ||
| 789 | u64 as_uint64; | ||
| 790 | struct { | ||
| 791 | u64 siefp_enabled:1; | ||
| 792 | u64 preserved:11; | ||
| 793 | u64 base_siefp_gpa:52; | ||
| 794 | } __packed; | ||
| 795 | }; | ||
| 733 | 796 | ||
| 734 | struct hv_vpset { | 797 | struct hv_vpset { |
| 735 | u64 format; | 798 | u64 format; |
| 736 | u64 valid_bank_mask; | 799 | u64 valid_bank_mask; |
| 737 | u64 bank_contents[]; | 800 | u64 bank_contents[]; |
| 738 | }; | 801 | } __packed; |
| 739 | 802 | ||
| 740 | /* HvCallSendSyntheticClusterIpi hypercall */ | 803 | /* HvCallSendSyntheticClusterIpi hypercall */ |
| 741 | struct hv_send_ipi { | 804 | struct hv_send_ipi { |
| 742 | u32 vector; | 805 | u32 vector; |
| 743 | u32 reserved; | 806 | u32 reserved; |
| 744 | u64 cpu_mask; | 807 | u64 cpu_mask; |
| 745 | }; | 808 | } __packed; |
| 746 | 809 | ||
| 747 | /* HvCallSendSyntheticClusterIpiEx hypercall */ | 810 | /* HvCallSendSyntheticClusterIpiEx hypercall */ |
| 748 | struct hv_send_ipi_ex { | 811 | struct hv_send_ipi_ex { |
| 749 | u32 vector; | 812 | u32 vector; |
| 750 | u32 reserved; | 813 | u32 reserved; |
| 751 | struct hv_vpset vp_set; | 814 | struct hv_vpset vp_set; |
| 752 | }; | 815 | } __packed; |
| 753 | 816 | ||
| 754 | /* HvFlushGuestPhysicalAddressSpace hypercalls */ | 817 | /* HvFlushGuestPhysicalAddressSpace hypercalls */ |
| 755 | struct hv_guest_mapping_flush { | 818 | struct hv_guest_mapping_flush { |
| 756 | u64 address_space; | 819 | u64 address_space; |
| 757 | u64 flags; | 820 | u64 flags; |
| 821 | } __packed; | ||
| 822 | |||
| 823 | /* | ||
| 824 | * HV_MAX_FLUSH_PAGES = "additional_pages" + 1. It's limited | ||
| 825 | * by the bitwidth of "additional_pages" in union hv_gpa_page_range. | ||
| 826 | */ | ||
| 827 | #define HV_MAX_FLUSH_PAGES (2048) | ||
| 828 | |||
| 829 | /* HvFlushGuestPhysicalAddressList hypercall */ | ||
| 830 | union hv_gpa_page_range { | ||
| 831 | u64 address_space; | ||
| 832 | struct { | ||
| 833 | u64 additional_pages:11; | ||
| 834 | u64 largepage:1; | ||
| 835 | u64 basepfn:52; | ||
| 836 | } page; | ||
| 837 | }; | ||
| 838 | |||
| 839 | /* | ||
| 840 | * All input flush parameters must fit in a single page. The max flush | ||
| 841 | * count equals the number of union hv_gpa_page_range entries that can | ||
| 842 | * be populated into the input parameter page. | ||
| 843 | */ | ||
| 844 | #define HV_MAX_FLUSH_REP_COUNT ((PAGE_SIZE - 2 * sizeof(u64)) / \ | ||
| 845 | sizeof(union hv_gpa_page_range)) | ||
| 846 | |||
| 847 | struct hv_guest_mapping_flush_list { | ||
| 848 | u64 address_space; | ||
| 849 | u64 flags; | ||
| 850 | union hv_gpa_page_range gpa_list[HV_MAX_FLUSH_REP_COUNT]; | ||
| 758 | }; | 851 | }; |
| 759 | 852 | ||
| 760 | /* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ | 853 | /* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ |
| @@ -763,7 +856,7 @@ struct hv_tlb_flush { | |||
| 763 | u64 flags; | 856 | u64 flags; |
| 764 | u64 processor_mask; | 857 | u64 processor_mask; |
| 765 | u64 gva_list[]; | 858 | u64 gva_list[]; |
| 766 | }; | 859 | } __packed; |
| 767 | 860 | ||
| 768 | /* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */ | 861 | /* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */ |
| 769 | struct hv_tlb_flush_ex { | 862 | struct hv_tlb_flush_ex { |
| @@ -771,6 +864,6 @@ struct hv_tlb_flush_ex { | |||
| 771 | u64 flags; | 864 | u64 flags; |
| 772 | struct hv_vpset hv_vp_set; | 865 | struct hv_vpset hv_vp_set; |
| 773 | u64 gva_list[]; | 866 | u64 gva_list[]; |
| 774 | }; | 867 | } __packed; |
| 775 | 868 | ||
| 776 | #endif | 869 | #endif |
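A quick sanity check of the flush-list sizing above, as an illustrative sketch only: assuming a 4 KiB PAGE_SIZE, a 16-byte header (address_space plus flags) and 8-byte union hv_gpa_page_range entries, HV_MAX_FLUSH_REP_COUNT works out to 510 entries per input page, while the 11-bit additional_pages field caps a single entry at HV_MAX_FLUSH_PAGES = 2047 + 1 = 2048 pages. A small user-space program reproducing the arithmetic:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            /* Assumed sizes: 4 KiB pages, 8-byte hv_gpa_page_range entries,
             * and a 16-byte header (address_space + flags). */
            const uint64_t page_size = 4096;
            const uint64_t header = 2 * sizeof(uint64_t);
            const uint64_t entry = sizeof(uint64_t);

            printf("HV_MAX_FLUSH_REP_COUNT = %llu\n",
                   (unsigned long long)((page_size - header) / entry)); /* 510 */
            printf("HV_MAX_FLUSH_PAGES     = %u\n", 1u << 11);          /* 2048 */
            return 0;
    }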
diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index b523f51c5400..634f99b1dc22 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h | |||
| @@ -2,10 +2,36 @@ | |||
| 2 | #ifndef _ASM_X86_INTEL_PT_H | 2 | #ifndef _ASM_X86_INTEL_PT_H |
| 3 | #define _ASM_X86_INTEL_PT_H | 3 | #define _ASM_X86_INTEL_PT_H |
| 4 | 4 | ||
| 5 | #define PT_CPUID_LEAVES 2 | ||
| 6 | #define PT_CPUID_REGS_NUM 4 /* number of registers (eax, ebx, ecx, edx) */ | ||
| 7 | |||
| 8 | enum pt_capabilities { | ||
| 9 | PT_CAP_max_subleaf = 0, | ||
| 10 | PT_CAP_cr3_filtering, | ||
| 11 | PT_CAP_psb_cyc, | ||
| 12 | PT_CAP_ip_filtering, | ||
| 13 | PT_CAP_mtc, | ||
| 14 | PT_CAP_ptwrite, | ||
| 15 | PT_CAP_power_event_trace, | ||
| 16 | PT_CAP_topa_output, | ||
| 17 | PT_CAP_topa_multiple_entries, | ||
| 18 | PT_CAP_single_range_output, | ||
| 19 | PT_CAP_output_subsys, | ||
| 20 | PT_CAP_payloads_lip, | ||
| 21 | PT_CAP_num_address_ranges, | ||
| 22 | PT_CAP_mtc_periods, | ||
| 23 | PT_CAP_cycle_thresholds, | ||
| 24 | PT_CAP_psb_periods, | ||
| 25 | }; | ||
| 26 | |||
| 5 | #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) | 27 | #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) |
| 6 | void cpu_emergency_stop_pt(void); | 28 | void cpu_emergency_stop_pt(void); |
| 29 | extern u32 intel_pt_validate_hw_cap(enum pt_capabilities cap); | ||
| 30 | extern u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities cap); | ||
| 7 | #else | 31 | #else |
| 8 | static inline void cpu_emergency_stop_pt(void) {} | 32 | static inline void cpu_emergency_stop_pt(void) {} |
| 33 | static inline u32 intel_pt_validate_hw_cap(enum pt_capabilities cap) { return 0; } | ||
| 34 | static inline u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability) { return 0; } | ||
| 9 | #endif | 35 | #endif |
| 10 | 36 | ||
| 11 | #endif /* _ASM_X86_INTEL_PT_H */ | 37 | #endif /* _ASM_X86_INTEL_PT_H */ |
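The capability interface above is meant to be queried one feature bit at a time; a minimal caller sketch follows (the helper name is hypothetical and kernel context with this header included is assumed — the !CONFIG_PERF_EVENTS stubs make the check collapse to false):

    /* Hypothetical helper: true only when ToPA output with multiple table
     * entries is enumerated, using the accessors declared above. */
    static bool pt_has_multi_entry_topa(void)
    {
            return intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
                   intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries);
    }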
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fbda5a917c5b..4660ce90de7f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h | |||
| @@ -439,6 +439,11 @@ struct kvm_mmu { | |||
| 439 | u64 pdptrs[4]; /* pae */ | 439 | u64 pdptrs[4]; /* pae */ |
| 440 | }; | 440 | }; |
| 441 | 441 | ||
| 442 | struct kvm_tlb_range { | ||
| 443 | u64 start_gfn; | ||
| 444 | u64 pages; | ||
| 445 | }; | ||
| 446 | |||
| 442 | enum pmc_type { | 447 | enum pmc_type { |
| 443 | KVM_PMC_GP = 0, | 448 | KVM_PMC_GP = 0, |
| 444 | KVM_PMC_FIXED, | 449 | KVM_PMC_FIXED, |
| @@ -497,7 +502,7 @@ struct kvm_mtrr { | |||
| 497 | struct kvm_vcpu_hv_stimer { | 502 | struct kvm_vcpu_hv_stimer { |
| 498 | struct hrtimer timer; | 503 | struct hrtimer timer; |
| 499 | int index; | 504 | int index; |
| 500 | u64 config; | 505 | union hv_stimer_config config; |
| 501 | u64 count; | 506 | u64 count; |
| 502 | u64 exp_time; | 507 | u64 exp_time; |
| 503 | struct hv_message msg; | 508 | struct hv_message msg; |
| @@ -601,17 +606,16 @@ struct kvm_vcpu_arch { | |||
| 601 | 606 | ||
| 602 | /* | 607 | /* |
| 603 | * QEMU userspace and the guest each have their own FPU state. | 608 | * QEMU userspace and the guest each have their own FPU state. |
| 604 | * In vcpu_run, we switch between the user and guest FPU contexts. | 609 | * In vcpu_run, we switch between the user, maintained in the |
| 605 | * While running a VCPU, the VCPU thread will have the guest FPU | 610 | * task_struct struct, and guest FPU contexts. While running a VCPU, |
| 606 | * context. | 611 | * the VCPU thread will have the guest FPU context. |
| 607 | * | 612 | * |
| 608 | * Note that while the PKRU state lives inside the fpu registers, | 613 | * Note that while the PKRU state lives inside the fpu registers, |
| 609 | * it is switched out separately at VMENTER and VMEXIT time. The | 614 | * it is switched out separately at VMENTER and VMEXIT time. The |
| 610 | * "guest_fpu" state here contains the guest FPU context, with the | 615 | * "guest_fpu" state here contains the guest FPU context, with the |
| 611 | * host PRKU bits. | 616 | * host PRKU bits. |
| 612 | */ | 617 | */ |
| 613 | struct fpu user_fpu; | 618 | struct fpu *guest_fpu; |
| 614 | struct fpu guest_fpu; | ||
| 615 | 619 | ||
| 616 | u64 xcr0; | 620 | u64 xcr0; |
| 617 | u64 guest_supported_xcr0; | 621 | u64 guest_supported_xcr0; |
| @@ -1042,6 +1046,8 @@ struct kvm_x86_ops { | |||
| 1042 | 1046 | ||
| 1043 | void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa); | 1047 | void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa); |
| 1044 | int (*tlb_remote_flush)(struct kvm *kvm); | 1048 | int (*tlb_remote_flush)(struct kvm *kvm); |
| 1049 | int (*tlb_remote_flush_with_range)(struct kvm *kvm, | ||
| 1050 | struct kvm_tlb_range *range); | ||
| 1045 | 1051 | ||
| 1046 | /* | 1052 | /* |
| 1047 | * Flush any TLB entries associated with the given GVA. | 1053 | * Flush any TLB entries associated with the given GVA. |
| @@ -1106,6 +1112,7 @@ struct kvm_x86_ops { | |||
| 1106 | bool (*mpx_supported)(void); | 1112 | bool (*mpx_supported)(void); |
| 1107 | bool (*xsaves_supported)(void); | 1113 | bool (*xsaves_supported)(void); |
| 1108 | bool (*umip_emulated)(void); | 1114 | bool (*umip_emulated)(void); |
| 1115 | bool (*pt_supported)(void); | ||
| 1109 | 1116 | ||
| 1110 | int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); | 1117 | int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); |
| 1111 | void (*request_immediate_exit)(struct kvm_vcpu *vcpu); | 1118 | void (*request_immediate_exit)(struct kvm_vcpu *vcpu); |
| @@ -1186,6 +1193,7 @@ struct kvm_x86_ops { | |||
| 1186 | 1193 | ||
| 1187 | int (*nested_enable_evmcs)(struct kvm_vcpu *vcpu, | 1194 | int (*nested_enable_evmcs)(struct kvm_vcpu *vcpu, |
| 1188 | uint16_t *vmcs_version); | 1195 | uint16_t *vmcs_version); |
| 1196 | uint16_t (*nested_get_evmcs_version)(struct kvm_vcpu *vcpu); | ||
| 1189 | }; | 1197 | }; |
| 1190 | 1198 | ||
| 1191 | struct kvm_arch_async_pf { | 1199 | struct kvm_arch_async_pf { |
| @@ -1196,6 +1204,7 @@ struct kvm_arch_async_pf { | |||
| 1196 | }; | 1204 | }; |
| 1197 | 1205 | ||
| 1198 | extern struct kvm_x86_ops *kvm_x86_ops; | 1206 | extern struct kvm_x86_ops *kvm_x86_ops; |
| 1207 | extern struct kmem_cache *x86_fpu_cache; | ||
| 1199 | 1208 | ||
| 1200 | #define __KVM_HAVE_ARCH_VM_ALLOC | 1209 | #define __KVM_HAVE_ARCH_VM_ALLOC |
| 1201 | static inline struct kvm *kvm_arch_alloc_vm(void) | 1210 | static inline struct kvm *kvm_arch_alloc_vm(void) |
| @@ -1492,7 +1501,7 @@ asmlinkage void kvm_spurious_fault(void); | |||
| 1492 | "cmpb $0, kvm_rebooting \n\t" \ | 1501 | "cmpb $0, kvm_rebooting \n\t" \ |
| 1493 | "jne 668b \n\t" \ | 1502 | "jne 668b \n\t" \ |
| 1494 | __ASM_SIZE(push) " $666b \n\t" \ | 1503 | __ASM_SIZE(push) " $666b \n\t" \ |
| 1495 | "call kvm_spurious_fault \n\t" \ | 1504 | "jmp kvm_spurious_fault \n\t" \ |
| 1496 | ".popsection \n\t" \ | 1505 | ".popsection \n\t" \ |
| 1497 | _ASM_EXTABLE(666b, 667b) | 1506 | _ASM_EXTABLE(666b, 667b) |
| 1498 | 1507 | ||
| @@ -1503,7 +1512,7 @@ asmlinkage void kvm_spurious_fault(void); | |||
| 1503 | int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); | 1512 | int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); |
| 1504 | int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); | 1513 | int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); |
| 1505 | int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); | 1514 | int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); |
| 1506 | void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); | 1515 | int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); |
| 1507 | int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); | 1516 | int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); |
| 1508 | int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); | 1517 | int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); |
| 1509 | int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); | 1518 | int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); |
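With guest_fpu now a pointer and the x86_fpu_cache kmem cache exported above, vCPU setup and teardown are expected to allocate and free the guest FPU state explicitly. A minimal sketch, assuming vCPU create/destroy context with error unwinding elided (the example_ function names are hypothetical):

    /* Allocate the guest FPU state from the dedicated cache. */
    static int example_alloc_guest_fpu(struct kvm_vcpu *vcpu)
    {
            vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL);
            return vcpu->arch.guest_fpu ? 0 : -ENOMEM;
    }

    /* Release it again when the vCPU goes away. */
    static void example_free_guest_fpu(struct kvm_vcpu *vcpu)
    {
            kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
            vcpu->arch.guest_fpu = NULL;
    }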
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 1d0a7778e163..cc60e617931c 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h | |||
| @@ -22,6 +22,11 @@ struct ms_hyperv_info { | |||
| 22 | 22 | ||
| 23 | extern struct ms_hyperv_info ms_hyperv; | 23 | extern struct ms_hyperv_info ms_hyperv; |
| 24 | 24 | ||
| 25 | |||
| 26 | typedef int (*hyperv_fill_flush_list_func)( | ||
| 27 | struct hv_guest_mapping_flush_list *flush, | ||
| 28 | void *data); | ||
| 29 | |||
| 25 | /* | 30 | /* |
| 26 | * Generate the guest ID. | 31 | * Generate the guest ID. |
| 27 | */ | 32 | */ |
| @@ -348,6 +353,11 @@ void set_hv_tscchange_cb(void (*cb)(void)); | |||
| 348 | void clear_hv_tscchange_cb(void); | 353 | void clear_hv_tscchange_cb(void); |
| 349 | void hyperv_stop_tsc_emulation(void); | 354 | void hyperv_stop_tsc_emulation(void); |
| 350 | int hyperv_flush_guest_mapping(u64 as); | 355 | int hyperv_flush_guest_mapping(u64 as); |
| 356 | int hyperv_flush_guest_mapping_range(u64 as, | ||
| 357 | hyperv_fill_flush_list_func fill_func, void *data); | ||
| 358 | int hyperv_fill_flush_guest_mapping_list( | ||
| 359 | struct hv_guest_mapping_flush_list *flush, | ||
| 360 | u64 start_gfn, u64 end_gfn); | ||
| 351 | 361 | ||
| 352 | #ifdef CONFIG_X86_64 | 362 | #ifdef CONFIG_X86_64 |
| 353 | void hv_apic_init(void); | 363 | void hv_apic_init(void); |
| @@ -370,6 +380,11 @@ static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu) | |||
| 370 | return NULL; | 380 | return NULL; |
| 371 | } | 381 | } |
| 372 | static inline int hyperv_flush_guest_mapping(u64 as) { return -1; } | 382 | static inline int hyperv_flush_guest_mapping(u64 as) { return -1; } |
| 383 | static inline int hyperv_flush_guest_mapping_range(u64 as, | ||
| 384 | hyperv_fill_flush_list_func fill_func, void *data) | ||
| 385 | { | ||
| 386 | return -1; | ||
| 387 | } | ||
| 373 | #endif /* CONFIG_HYPERV */ | 388 | #endif /* CONFIG_HYPERV */ |
| 374 | 389 | ||
| 375 | #ifdef CONFIG_HYPERV_TSCPAGE | 390 | #ifdef CONFIG_HYPERV_TSCPAGE |
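The ranged-flush interface above is callback driven: the caller supplies a fill function that turns its private data into gpa_list entries, and hyperv_flush_guest_mapping_range() issues the rep hypercall over the populated page. A sketch of the expected shape, assuming kernel context, the kvm_tlb_range type introduced earlier in this series, and at least one page in the range; the example_ names and the "return the number of populated entries" convention are assumptions:

    static int example_fill_flush_list(struct hv_guest_mapping_flush_list *flush,
                                       void *data)
    {
            struct kvm_tlb_range *range = data;
            u64 pages = min_t(u64, range->pages, HV_MAX_FLUSH_PAGES);

            /* Describe the whole range with a single base-page entry;
             * real code would split larger or huge-page ranges. */
            flush->gpa_list[0].page.largepage = false;
            flush->gpa_list[0].page.basepfn = range->start_gfn;
            flush->gpa_list[0].page.additional_pages = pages - 1;

            return 1;
    }

    static int example_remote_flush_range(u64 root_ptr,
                                          struct kvm_tlb_range *range)
    {
            return hyperv_flush_guest_mapping_range(root_ptr,
                                                    example_fill_flush_list,
                                                    range);
    }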
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 9e39cc8bd989..8e40c2446fd1 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h | |||
| @@ -121,7 +121,43 @@ | |||
| 121 | #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 | 121 | #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 |
| 122 | 122 | ||
| 123 | #define MSR_IA32_RTIT_CTL 0x00000570 | 123 | #define MSR_IA32_RTIT_CTL 0x00000570 |
| 124 | #define RTIT_CTL_TRACEEN BIT(0) | ||
| 125 | #define RTIT_CTL_CYCLEACC BIT(1) | ||
| 126 | #define RTIT_CTL_OS BIT(2) | ||
| 127 | #define RTIT_CTL_USR BIT(3) | ||
| 128 | #define RTIT_CTL_PWR_EVT_EN BIT(4) | ||
| 129 | #define RTIT_CTL_FUP_ON_PTW BIT(5) | ||
| 130 | #define RTIT_CTL_FABRIC_EN BIT(6) | ||
| 131 | #define RTIT_CTL_CR3EN BIT(7) | ||
| 132 | #define RTIT_CTL_TOPA BIT(8) | ||
| 133 | #define RTIT_CTL_MTC_EN BIT(9) | ||
| 134 | #define RTIT_CTL_TSC_EN BIT(10) | ||
| 135 | #define RTIT_CTL_DISRETC BIT(11) | ||
| 136 | #define RTIT_CTL_PTW_EN BIT(12) | ||
| 137 | #define RTIT_CTL_BRANCH_EN BIT(13) | ||
| 138 | #define RTIT_CTL_MTC_RANGE_OFFSET 14 | ||
| 139 | #define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) | ||
| 140 | #define RTIT_CTL_CYC_THRESH_OFFSET 19 | ||
| 141 | #define RTIT_CTL_CYC_THRESH (0x0full << RTIT_CTL_CYC_THRESH_OFFSET) | ||
| 142 | #define RTIT_CTL_PSB_FREQ_OFFSET 24 | ||
| 143 | #define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) | ||
| 144 | #define RTIT_CTL_ADDR0_OFFSET 32 | ||
| 145 | #define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET) | ||
| 146 | #define RTIT_CTL_ADDR1_OFFSET 36 | ||
| 147 | #define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET) | ||
| 148 | #define RTIT_CTL_ADDR2_OFFSET 40 | ||
| 149 | #define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET) | ||
| 150 | #define RTIT_CTL_ADDR3_OFFSET 44 | ||
| 151 | #define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET) | ||
| 124 | #define MSR_IA32_RTIT_STATUS 0x00000571 | 152 | #define MSR_IA32_RTIT_STATUS 0x00000571 |
| 153 | #define RTIT_STATUS_FILTEREN BIT(0) | ||
| 154 | #define RTIT_STATUS_CONTEXTEN BIT(1) | ||
| 155 | #define RTIT_STATUS_TRIGGEREN BIT(2) | ||
| 156 | #define RTIT_STATUS_BUFFOVF BIT(3) | ||
| 157 | #define RTIT_STATUS_ERROR BIT(4) | ||
| 158 | #define RTIT_STATUS_STOPPED BIT(5) | ||
| 159 | #define RTIT_STATUS_BYTECNT_OFFSET 32 | ||
| 160 | #define RTIT_STATUS_BYTECNT (0x1ffffull << RTIT_STATUS_BYTECNT_OFFSET) | ||
| 125 | #define MSR_IA32_RTIT_ADDR0_A 0x00000580 | 161 | #define MSR_IA32_RTIT_ADDR0_A 0x00000580 |
| 126 | #define MSR_IA32_RTIT_ADDR0_B 0x00000581 | 162 | #define MSR_IA32_RTIT_ADDR0_B 0x00000581 |
| 127 | #define MSR_IA32_RTIT_ADDR1_A 0x00000582 | 163 | #define MSR_IA32_RTIT_ADDR1_A 0x00000582 |
| @@ -772,6 +808,7 @@ | |||
| 772 | #define VMX_BASIC_INOUT 0x0040000000000000LLU | 808 | #define VMX_BASIC_INOUT 0x0040000000000000LLU |
| 773 | 809 | ||
| 774 | /* MSR_IA32_VMX_MISC bits */ | 810 | /* MSR_IA32_VMX_MISC bits */ |
| 811 | #define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14) | ||
| 775 | #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) | 812 | #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) |
| 776 | #define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F | 813 | #define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F |
| 777 | /* AMD-V MSRs */ | 814 | /* AMD-V MSRs */ |
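The new RTIT_CTL bit definitions compose into a single MSR value. A brief sketch follows; the function name, the chosen feature bits, and the MTC range encoding of 3 are illustrative, and real code must first confirm the corresponding PT capabilities from CPUID leaf 0x14:

    static void example_enable_pt_branch_tracing(void)
    {
            u64 ctl = RTIT_CTL_TRACEEN | RTIT_CTL_BRANCH_EN | RTIT_CTL_TOPA |
                      RTIT_CTL_TSC_EN | RTIT_CTL_MTC_EN |
                      (3ULL << RTIT_CTL_MTC_RANGE_OFFSET);

            wrmsrl(MSR_IA32_RTIT_CTL, ctl);
    }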
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 93b462e48067..dec9c1e84c78 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h | |||
| @@ -290,11 +290,4 @@ struct __attribute__ ((__packed__)) vmcb { | |||
| 290 | 290 | ||
| 291 | #define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP) | 291 | #define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP) |
| 292 | 292 | ||
| 293 | #define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" | ||
| 294 | #define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" | ||
| 295 | #define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb" | ||
| 296 | #define SVM_CLGI ".byte 0x0f, 0x01, 0xdd" | ||
| 297 | #define SVM_STGI ".byte 0x0f, 0x01, 0xdc" | ||
| 298 | #define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf" | ||
| 299 | |||
| 300 | #endif | 293 | #endif |
diff --git a/arch/x86/include/asm/trace/hyperv.h b/arch/x86/include/asm/trace/hyperv.h index 2e6245a023ef..ace464f09681 100644 --- a/arch/x86/include/asm/trace/hyperv.h +++ b/arch/x86/include/asm/trace/hyperv.h | |||
| @@ -42,6 +42,20 @@ TRACE_EVENT(hyperv_nested_flush_guest_mapping, | |||
| 42 | TP_printk("address space %llx ret %d", __entry->as, __entry->ret) | 42 | TP_printk("address space %llx ret %d", __entry->as, __entry->ret) |
| 43 | ); | 43 | ); |
| 44 | 44 | ||
| 45 | TRACE_EVENT(hyperv_nested_flush_guest_mapping_range, | ||
| 46 | TP_PROTO(u64 as, int ret), | ||
| 47 | TP_ARGS(as, ret), | ||
| 48 | |||
| 49 | TP_STRUCT__entry( | ||
| 50 | __field(u64, as) | ||
| 51 | __field(int, ret) | ||
| 52 | ), | ||
| 53 | TP_fast_assign(__entry->as = as; | ||
| 54 | __entry->ret = ret; | ||
| 55 | ), | ||
| 56 | TP_printk("address space %llx ret %d", __entry->as, __entry->ret) | ||
| 57 | ); | ||
| 58 | |||
| 45 | TRACE_EVENT(hyperv_send_ipi_mask, | 59 | TRACE_EVENT(hyperv_send_ipi_mask, |
| 46 | TP_PROTO(const struct cpumask *cpus, | 60 | TP_PROTO(const struct cpumask *cpus, |
| 47 | int vector), | 61 | int vector), |
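For reference, TRACE_EVENT(hyperv_nested_flush_guest_mapping_range, ...) generates a trace_hyperv_nested_flush_guest_mapping_range(as, ret) call, mirroring the existing hyperv_nested_flush_guest_mapping tracepoint. A sketch of where it would be emitted (the wrapper name and parameters are assumptions):

    /* Emit the new tracepoint from a ranged-flush helper once the
     * hypercall status "ret" for address space "as" is known. */
    static void example_trace_range_flush(u64 as, int ret)
    {
            trace_hyperv_nested_flush_guest_mapping_range(as, ret);
    }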
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index ade0f153947d..4e4133e86484 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h | |||
| @@ -77,7 +77,10 @@ | |||
| 77 | #define SECONDARY_EXEC_ENCLS_EXITING 0x00008000 | 77 | #define SECONDARY_EXEC_ENCLS_EXITING 0x00008000 |
| 78 | #define SECONDARY_EXEC_RDSEED_EXITING 0x00010000 | 78 | #define SECONDARY_EXEC_RDSEED_EXITING 0x00010000 |
| 79 | #define SECONDARY_EXEC_ENABLE_PML 0x00020000 | 79 | #define SECONDARY_EXEC_ENABLE_PML 0x00020000 |
| 80 | #define SECONDARY_EXEC_PT_CONCEAL_VMX 0x00080000 | ||
| 80 | #define SECONDARY_EXEC_XSAVES 0x00100000 | 81 | #define SECONDARY_EXEC_XSAVES 0x00100000 |
| 82 | #define SECONDARY_EXEC_PT_USE_GPA 0x01000000 | ||
| 83 | #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC 0x00400000 | ||
| 81 | #define SECONDARY_EXEC_TSC_SCALING 0x02000000 | 84 | #define SECONDARY_EXEC_TSC_SCALING 0x02000000 |
| 82 | 85 | ||
| 83 | #define PIN_BASED_EXT_INTR_MASK 0x00000001 | 86 | #define PIN_BASED_EXT_INTR_MASK 0x00000001 |
| @@ -98,6 +101,8 @@ | |||
| 98 | #define VM_EXIT_LOAD_IA32_EFER 0x00200000 | 101 | #define VM_EXIT_LOAD_IA32_EFER 0x00200000 |
| 99 | #define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000 | 102 | #define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000 |
| 100 | #define VM_EXIT_CLEAR_BNDCFGS 0x00800000 | 103 | #define VM_EXIT_CLEAR_BNDCFGS 0x00800000 |
| 104 | #define VM_EXIT_PT_CONCEAL_PIP 0x01000000 | ||
| 105 | #define VM_EXIT_CLEAR_IA32_RTIT_CTL 0x02000000 | ||
| 101 | 106 | ||
| 102 | #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff | 107 | #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff |
| 103 | 108 | ||
| @@ -109,6 +114,8 @@ | |||
| 109 | #define VM_ENTRY_LOAD_IA32_PAT 0x00004000 | 114 | #define VM_ENTRY_LOAD_IA32_PAT 0x00004000 |
| 110 | #define VM_ENTRY_LOAD_IA32_EFER 0x00008000 | 115 | #define VM_ENTRY_LOAD_IA32_EFER 0x00008000 |
| 111 | #define VM_ENTRY_LOAD_BNDCFGS 0x00010000 | 116 | #define VM_ENTRY_LOAD_BNDCFGS 0x00010000 |
| 117 | #define VM_ENTRY_PT_CONCEAL_PIP 0x00020000 | ||
| 118 | #define VM_ENTRY_LOAD_IA32_RTIT_CTL 0x00040000 | ||
| 112 | 119 | ||
| 113 | #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff | 120 | #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff |
| 114 | 121 | ||
| @@ -240,6 +247,8 @@ enum vmcs_field { | |||
| 240 | GUEST_PDPTR3_HIGH = 0x00002811, | 247 | GUEST_PDPTR3_HIGH = 0x00002811, |
| 241 | GUEST_BNDCFGS = 0x00002812, | 248 | GUEST_BNDCFGS = 0x00002812, |
| 242 | GUEST_BNDCFGS_HIGH = 0x00002813, | 249 | GUEST_BNDCFGS_HIGH = 0x00002813, |
| 250 | GUEST_IA32_RTIT_CTL = 0x00002814, | ||
| 251 | GUEST_IA32_RTIT_CTL_HIGH = 0x00002815, | ||
| 243 | HOST_IA32_PAT = 0x00002c00, | 252 | HOST_IA32_PAT = 0x00002c00, |
| 244 | HOST_IA32_PAT_HIGH = 0x00002c01, | 253 | HOST_IA32_PAT_HIGH = 0x00002c01, |
| 245 | HOST_IA32_EFER = 0x00002c02, | 254 | HOST_IA32_EFER = 0x00002c02, |
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 30084ecaa20f..e811d4d1c824 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c | |||
| @@ -1,19 +1,6 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* KVM paravirtual clock driver. A clocksource implementation | 2 | /* KVM paravirtual clock driver. A clocksource implementation |
| 2 | Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc. | 3 | Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc. |
| 3 | |||
| 4 | This program is free software; you can redistribute it and/or modify | ||
| 5 | it under the terms of the GNU General Public License as published by | ||
| 6 | the Free Software Foundation; either version 2 of the License, or | ||
| 7 | (at your option) any later version. | ||
| 8 | |||
| 9 | This program is distributed in the hope that it will be useful, | ||
| 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | GNU General Public License for more details. | ||
| 13 | |||
| 14 | You should have received a copy of the GNU General Public License | ||
| 15 | along with this program; if not, write to the Free Software | ||
| 16 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 17 | */ | 4 | */ |
| 18 | 5 | ||
| 19 | #include <linux/clocksource.h> | 6 | #include <linux/clocksource.h> |
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index dc4f2fdf5e57..69b3a7c30013 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile | |||
| @@ -16,7 +16,7 @@ kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ | |||
| 16 | i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ | 16 | i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ |
| 17 | hyperv.o page_track.o debugfs.o | 17 | hyperv.o page_track.o debugfs.o |
| 18 | 18 | ||
| 19 | kvm-intel-y += vmx.o pmu_intel.o | 19 | kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o |
| 20 | kvm-amd-y += svm.o pmu_amd.o | 20 | kvm-amd-y += svm.o pmu_amd.o |
| 21 | 21 | ||
| 22 | obj-$(CONFIG_KVM) += kvm.o | 22 | obj-$(CONFIG_KVM) += kvm.o |
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7bcfa61375c0..bbffa6c54697 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c | |||
| @@ -67,9 +67,6 @@ u64 kvm_supported_xcr0(void) | |||
| 67 | 67 | ||
| 68 | #define F(x) bit(X86_FEATURE_##x) | 68 | #define F(x) bit(X86_FEATURE_##x) |
| 69 | 69 | ||
| 70 | /* For scattered features from cpufeatures.h; we currently expose none */ | ||
| 71 | #define KF(x) bit(KVM_CPUID_BIT_##x) | ||
| 72 | |||
| 73 | int kvm_update_cpuid(struct kvm_vcpu *vcpu) | 70 | int kvm_update_cpuid(struct kvm_vcpu *vcpu) |
| 74 | { | 71 | { |
| 75 | struct kvm_cpuid_entry2 *best; | 72 | struct kvm_cpuid_entry2 *best; |
| @@ -337,6 +334,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
| 337 | unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0; | 334 | unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0; |
| 338 | unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; | 335 | unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; |
| 339 | unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; | 336 | unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; |
| 337 | unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0; | ||
| 340 | 338 | ||
| 341 | /* cpuid 1.edx */ | 339 | /* cpuid 1.edx */ |
| 342 | const u32 kvm_cpuid_1_edx_x86_features = | 340 | const u32 kvm_cpuid_1_edx_x86_features = |
| @@ -380,8 +378,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
| 380 | 378 | ||
| 381 | /* cpuid 0x80000008.ebx */ | 379 | /* cpuid 0x80000008.ebx */ |
| 382 | const u32 kvm_cpuid_8000_0008_ebx_x86_features = | 380 | const u32 kvm_cpuid_8000_0008_ebx_x86_features = |
| 383 | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | | 381 | F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | |
| 384 | F(AMD_SSB_NO); | 382 | F(AMD_SSB_NO) | F(AMD_STIBP); |
| 385 | 383 | ||
| 386 | /* cpuid 0xC0000001.edx */ | 384 | /* cpuid 0xC0000001.edx */ |
| 387 | const u32 kvm_cpuid_C000_0001_edx_x86_features = | 385 | const u32 kvm_cpuid_C000_0001_edx_x86_features = |
| @@ -395,7 +393,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
| 395 | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | | 393 | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | |
| 396 | F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | | 394 | F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | |
| 397 | F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | | 395 | F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | |
| 398 | F(SHA_NI) | F(AVX512BW) | F(AVX512VL); | 396 | F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt; |
| 399 | 397 | ||
| 400 | /* cpuid 0xD.1.eax */ | 398 | /* cpuid 0xD.1.eax */ |
| 401 | const u32 kvm_cpuid_D_1_eax_x86_features = | 399 | const u32 kvm_cpuid_D_1_eax_x86_features = |
| @@ -411,7 +409,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
| 411 | /* cpuid 7.0.edx*/ | 409 | /* cpuid 7.0.edx*/ |
| 412 | const u32 kvm_cpuid_7_0_edx_x86_features = | 410 | const u32 kvm_cpuid_7_0_edx_x86_features = |
| 413 | F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | | 411 | F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | |
| 414 | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES); | 412 | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP); |
| 415 | 413 | ||
| 416 | /* all calls to cpuid_count() should be made on the same cpu */ | 414 | /* all calls to cpuid_count() should be made on the same cpu */ |
| 417 | get_cpu(); | 415 | get_cpu(); |
| @@ -426,7 +424,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
| 426 | 424 | ||
| 427 | switch (function) { | 425 | switch (function) { |
| 428 | case 0: | 426 | case 0: |
| 429 | entry->eax = min(entry->eax, (u32)0xd); | 427 | entry->eax = min(entry->eax, (u32)(f_intel_pt ? 0x14 : 0xd)); |
| 430 | break; | 428 | break; |
| 431 | case 1: | 429 | case 1: |
| 432 | entry->edx &= kvm_cpuid_1_edx_x86_features; | 430 | entry->edx &= kvm_cpuid_1_edx_x86_features; |
| @@ -603,6 +601,23 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
| 603 | } | 601 | } |
| 604 | break; | 602 | break; |
| 605 | } | 603 | } |
| 604 | /* Intel PT */ | ||
| 605 | case 0x14: { | ||
| 606 | int t, times = entry->eax; | ||
| 607 | |||
| 608 | if (!f_intel_pt) | ||
| 609 | break; | ||
| 610 | |||
| 611 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
| 612 | for (t = 1; t <= times; ++t) { | ||
| 613 | if (*nent >= maxnent) | ||
| 614 | goto out; | ||
| 615 | do_cpuid_1_ent(&entry[t], function, t); | ||
| 616 | entry[t].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
| 617 | ++*nent; | ||
| 618 | } | ||
| 619 | break; | ||
| 620 | } | ||
| 606 | case KVM_CPUID_SIGNATURE: { | 621 | case KVM_CPUID_SIGNATURE: { |
| 607 | static const char signature[12] = "KVMKVMKVM\0\0"; | 622 | static const char signature[12] = "KVMKVMKVM\0\0"; |
| 608 | const u32 *sigptr = (const u32 *)signature; | 623 | const u32 *sigptr = (const u32 *)signature; |
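The leaf 0x14 loop above mirrors how Intel PT is enumerated on bare metal: subleaf 0's EAX reports the index of the last valid subleaf, and subleaves 1..EAX carry the packet-frequency and address-range details. A user-space sketch of the same enumeration, not part of the patch, using GCC's cpuid.h:

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx, max_subleaf, t;

            if (!__get_cpuid_count(0x14, 0, &eax, &ebx, &ecx, &edx)) {
                    puts("CPUID leaf 0x14 not reported");
                    return 1;
            }
            max_subleaf = eax;      /* same value "times" is read from above */
            for (t = 1; t <= max_subleaf; t++) {
                    __get_cpuid_count(0x14, t, &eax, &ebx, &ecx, &edx);
                    printf("0x14.%u: eax=%#x ebx=%#x ecx=%#x edx=%#x\n",
                           t, eax, ebx, ecx, edx);
            }
            return 0;
    }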
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 4e80080f277a..c90a5352d158 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c | |||
| @@ -38,6 +38,9 @@ | |||
| 38 | 38 | ||
| 39 | #define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64) | 39 | #define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64) |
| 40 | 40 | ||
| 41 | static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, | ||
| 42 | bool vcpu_kick); | ||
| 43 | |||
| 41 | static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint) | 44 | static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint) |
| 42 | { | 45 | { |
| 43 | return atomic64_read(&synic->sint[sint]); | 46 | return atomic64_read(&synic->sint[sint]); |
| @@ -158,59 +161,24 @@ static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx) | |||
| 158 | return (synic->active) ? synic : NULL; | 161 | return (synic->active) ? synic : NULL; |
| 159 | } | 162 | } |
| 160 | 163 | ||
| 161 | static void synic_clear_sint_msg_pending(struct kvm_vcpu_hv_synic *synic, | ||
| 162 | u32 sint) | ||
| 163 | { | ||
| 164 | struct kvm_vcpu *vcpu = synic_to_vcpu(synic); | ||
| 165 | struct page *page; | ||
| 166 | gpa_t gpa; | ||
| 167 | struct hv_message *msg; | ||
| 168 | struct hv_message_page *msg_page; | ||
| 169 | |||
| 170 | gpa = synic->msg_page & PAGE_MASK; | ||
| 171 | page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT); | ||
| 172 | if (is_error_page(page)) { | ||
| 173 | vcpu_err(vcpu, "Hyper-V SynIC can't get msg page, gpa 0x%llx\n", | ||
| 174 | gpa); | ||
| 175 | return; | ||
| 176 | } | ||
| 177 | msg_page = kmap_atomic(page); | ||
| 178 | |||
| 179 | msg = &msg_page->sint_message[sint]; | ||
| 180 | msg->header.message_flags.msg_pending = 0; | ||
| 181 | |||
| 182 | kunmap_atomic(msg_page); | ||
| 183 | kvm_release_page_dirty(page); | ||
| 184 | kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); | ||
| 185 | } | ||
| 186 | |||
| 187 | static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint) | 164 | static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint) |
| 188 | { | 165 | { |
| 189 | struct kvm *kvm = vcpu->kvm; | 166 | struct kvm *kvm = vcpu->kvm; |
| 190 | struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); | 167 | struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); |
| 191 | struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); | 168 | struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); |
| 192 | struct kvm_vcpu_hv_stimer *stimer; | 169 | struct kvm_vcpu_hv_stimer *stimer; |
| 193 | int gsi, idx, stimers_pending; | 170 | int gsi, idx; |
| 194 | 171 | ||
| 195 | trace_kvm_hv_notify_acked_sint(vcpu->vcpu_id, sint); | 172 | trace_kvm_hv_notify_acked_sint(vcpu->vcpu_id, sint); |
| 196 | 173 | ||
| 197 | if (synic->msg_page & HV_SYNIC_SIMP_ENABLE) | ||
| 198 | synic_clear_sint_msg_pending(synic, sint); | ||
| 199 | |||
| 200 | /* Try to deliver pending Hyper-V SynIC timers messages */ | 174 | /* Try to deliver pending Hyper-V SynIC timers messages */ |
| 201 | stimers_pending = 0; | ||
| 202 | for (idx = 0; idx < ARRAY_SIZE(hv_vcpu->stimer); idx++) { | 175 | for (idx = 0; idx < ARRAY_SIZE(hv_vcpu->stimer); idx++) { |
| 203 | stimer = &hv_vcpu->stimer[idx]; | 176 | stimer = &hv_vcpu->stimer[idx]; |
| 204 | if (stimer->msg_pending && | 177 | if (stimer->msg_pending && stimer->config.enable && |
| 205 | (stimer->config & HV_STIMER_ENABLE) && | 178 | !stimer->config.direct_mode && |
| 206 | HV_STIMER_SINT(stimer->config) == sint) { | 179 | stimer->config.sintx == sint) |
| 207 | set_bit(stimer->index, | 180 | stimer_mark_pending(stimer, false); |
| 208 | hv_vcpu->stimer_pending_bitmap); | ||
| 209 | stimers_pending++; | ||
| 210 | } | ||
| 211 | } | 181 | } |
| 212 | if (stimers_pending) | ||
| 213 | kvm_make_request(KVM_REQ_HV_STIMER, vcpu); | ||
| 214 | 182 | ||
| 215 | idx = srcu_read_lock(&kvm->irq_srcu); | 183 | idx = srcu_read_lock(&kvm->irq_srcu); |
| 216 | gsi = atomic_read(&synic->sint_to_gsi[sint]); | 184 | gsi = atomic_read(&synic->sint_to_gsi[sint]); |
| @@ -497,7 +465,7 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer) | |||
| 497 | time_now = get_time_ref_counter(stimer_to_vcpu(stimer)->kvm); | 465 | time_now = get_time_ref_counter(stimer_to_vcpu(stimer)->kvm); |
| 498 | ktime_now = ktime_get(); | 466 | ktime_now = ktime_get(); |
| 499 | 467 | ||
| 500 | if (stimer->config & HV_STIMER_PERIODIC) { | 468 | if (stimer->config.periodic) { |
| 501 | if (stimer->exp_time) { | 469 | if (stimer->exp_time) { |
| 502 | if (time_now >= stimer->exp_time) { | 470 | if (time_now >= stimer->exp_time) { |
| 503 | u64 remainder; | 471 | u64 remainder; |
| @@ -546,13 +514,18 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer) | |||
| 546 | static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, | 514 | static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, |
| 547 | bool host) | 515 | bool host) |
| 548 | { | 516 | { |
| 517 | union hv_stimer_config new_config = {.as_uint64 = config}, | ||
| 518 | old_config = {.as_uint64 = stimer->config.as_uint64}; | ||
| 519 | |||
| 549 | trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id, | 520 | trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id, |
| 550 | stimer->index, config, host); | 521 | stimer->index, config, host); |
| 551 | 522 | ||
| 552 | stimer_cleanup(stimer); | 523 | stimer_cleanup(stimer); |
| 553 | if ((stimer->config & HV_STIMER_ENABLE) && HV_STIMER_SINT(config) == 0) | 524 | if (old_config.enable && |
| 554 | config &= ~HV_STIMER_ENABLE; | 525 | !new_config.direct_mode && new_config.sintx == 0) |
| 555 | stimer->config = config; | 526 | new_config.enable = 0; |
| 527 | stimer->config.as_uint64 = new_config.as_uint64; | ||
| 528 | |||
| 556 | stimer_mark_pending(stimer, false); | 529 | stimer_mark_pending(stimer, false); |
| 557 | return 0; | 530 | return 0; |
| 558 | } | 531 | } |
| @@ -566,16 +539,16 @@ static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count, | |||
| 566 | stimer_cleanup(stimer); | 539 | stimer_cleanup(stimer); |
| 567 | stimer->count = count; | 540 | stimer->count = count; |
| 568 | if (stimer->count == 0) | 541 | if (stimer->count == 0) |
| 569 | stimer->config &= ~HV_STIMER_ENABLE; | 542 | stimer->config.enable = 0; |
| 570 | else if (stimer->config & HV_STIMER_AUTOENABLE) | 543 | else if (stimer->config.auto_enable) |
| 571 | stimer->config |= HV_STIMER_ENABLE; | 544 | stimer->config.enable = 1; |
| 572 | stimer_mark_pending(stimer, false); | 545 | stimer_mark_pending(stimer, false); |
| 573 | return 0; | 546 | return 0; |
| 574 | } | 547 | } |
| 575 | 548 | ||
| 576 | static int stimer_get_config(struct kvm_vcpu_hv_stimer *stimer, u64 *pconfig) | 549 | static int stimer_get_config(struct kvm_vcpu_hv_stimer *stimer, u64 *pconfig) |
| 577 | { | 550 | { |
| 578 | *pconfig = stimer->config; | 551 | *pconfig = stimer->config.as_uint64; |
| 579 | return 0; | 552 | return 0; |
| 580 | } | 553 | } |
| 581 | 554 | ||
| @@ -586,44 +559,60 @@ static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount) | |||
| 586 | } | 559 | } |
| 587 | 560 | ||
| 588 | static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint, | 561 | static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint, |
| 589 | struct hv_message *src_msg) | 562 | struct hv_message *src_msg, bool no_retry) |
| 590 | { | 563 | { |
| 591 | struct kvm_vcpu *vcpu = synic_to_vcpu(synic); | 564 | struct kvm_vcpu *vcpu = synic_to_vcpu(synic); |
| 592 | struct page *page; | 565 | int msg_off = offsetof(struct hv_message_page, sint_message[sint]); |
| 593 | gpa_t gpa; | 566 | gfn_t msg_page_gfn; |
| 594 | struct hv_message *dst_msg; | 567 | struct hv_message_header hv_hdr; |
| 595 | int r; | 568 | int r; |
| 596 | struct hv_message_page *msg_page; | ||
| 597 | 569 | ||
| 598 | if (!(synic->msg_page & HV_SYNIC_SIMP_ENABLE)) | 570 | if (!(synic->msg_page & HV_SYNIC_SIMP_ENABLE)) |
| 599 | return -ENOENT; | 571 | return -ENOENT; |
| 600 | 572 | ||
| 601 | gpa = synic->msg_page & PAGE_MASK; | 573 | msg_page_gfn = synic->msg_page >> PAGE_SHIFT; |
| 602 | page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT); | ||
| 603 | if (is_error_page(page)) | ||
| 604 | return -EFAULT; | ||
| 605 | 574 | ||
| 606 | msg_page = kmap_atomic(page); | 575 | /* |
| 607 | dst_msg = &msg_page->sint_message[sint]; | 576 | * Strictly following the spec-mandated ordering would require setting |
| 608 | if (sync_cmpxchg(&dst_msg->header.message_type, HVMSG_NONE, | 577 | * .msg_pending before checking .message_type. However, this function |
| 609 | src_msg->header.message_type) != HVMSG_NONE) { | 578 | * is only called in vcpu context so the entire update is atomic from |
| 610 | dst_msg->header.message_flags.msg_pending = 1; | 579 | * guest POV and thus the exact order here doesn't matter. |
| 611 | r = -EAGAIN; | 580 | */ |
| 612 | } else { | 581 | r = kvm_vcpu_read_guest_page(vcpu, msg_page_gfn, &hv_hdr.message_type, |
| 613 | memcpy(&dst_msg->u.payload, &src_msg->u.payload, | 582 | msg_off + offsetof(struct hv_message, |
| 614 | src_msg->header.payload_size); | 583 | header.message_type), |
| 615 | dst_msg->header.message_type = src_msg->header.message_type; | 584 | sizeof(hv_hdr.message_type)); |
| 616 | dst_msg->header.payload_size = src_msg->header.payload_size; | 585 | if (r < 0) |
| 617 | r = synic_set_irq(synic, sint); | 586 | return r; |
| 618 | if (r >= 1) | 587 | |
| 619 | r = 0; | 588 | if (hv_hdr.message_type != HVMSG_NONE) { |
| 620 | else if (r == 0) | 589 | if (no_retry) |
| 621 | r = -EFAULT; | 590 | return 0; |
| 591 | |||
| 592 | hv_hdr.message_flags.msg_pending = 1; | ||
| 593 | r = kvm_vcpu_write_guest_page(vcpu, msg_page_gfn, | ||
| 594 | &hv_hdr.message_flags, | ||
| 595 | msg_off + | ||
| 596 | offsetof(struct hv_message, | ||
| 597 | header.message_flags), | ||
| 598 | sizeof(hv_hdr.message_flags)); | ||
| 599 | if (r < 0) | ||
| 600 | return r; | ||
| 601 | return -EAGAIN; | ||
| 622 | } | 602 | } |
| 623 | kunmap_atomic(msg_page); | 603 | |
| 624 | kvm_release_page_dirty(page); | 604 | r = kvm_vcpu_write_guest_page(vcpu, msg_page_gfn, src_msg, msg_off, |
| 625 | kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); | 605 | sizeof(src_msg->header) + |
| 626 | return r; | 606 | src_msg->header.payload_size); |
| 607 | if (r < 0) | ||
| 608 | return r; | ||
| 609 | |||
| 610 | r = synic_set_irq(synic, sint); | ||
| 611 | if (r < 0) | ||
| 612 | return r; | ||
| 613 | if (r == 0) | ||
| 614 | return -EFAULT; | ||
| 615 | return 0; | ||
| 627 | } | 616 | } |
| 628 | 617 | ||
| 629 | static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer) | 618 | static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer) |
| @@ -633,24 +622,45 @@ static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer) | |||
| 633 | struct hv_timer_message_payload *payload = | 622 | struct hv_timer_message_payload *payload = |
| 634 | (struct hv_timer_message_payload *)&msg->u.payload; | 623 | (struct hv_timer_message_payload *)&msg->u.payload; |
| 635 | 624 | ||
| 625 | /* | ||
| 626 | * To avoid piling up periodic ticks, don't retry message | ||
| 627 | * delivery for them (within "lazy" lost ticks policy). | ||
| 628 | */ | ||
| 629 | bool no_retry = stimer->config.periodic; | ||
| 630 | |||
| 636 | payload->expiration_time = stimer->exp_time; | 631 | payload->expiration_time = stimer->exp_time; |
| 637 | payload->delivery_time = get_time_ref_counter(vcpu->kvm); | 632 | payload->delivery_time = get_time_ref_counter(vcpu->kvm); |
| 638 | return synic_deliver_msg(vcpu_to_synic(vcpu), | 633 | return synic_deliver_msg(vcpu_to_synic(vcpu), |
| 639 | HV_STIMER_SINT(stimer->config), msg); | 634 | stimer->config.sintx, msg, |
| 635 | no_retry); | ||
| 636 | } | ||
| 637 | |||
| 638 | static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer) | ||
| 639 | { | ||
| 640 | struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); | ||
| 641 | struct kvm_lapic_irq irq = { | ||
| 642 | .delivery_mode = APIC_DM_FIXED, | ||
| 643 | .vector = stimer->config.apic_vector | ||
| 644 | }; | ||
| 645 | |||
| 646 | return !kvm_apic_set_irq(vcpu, &irq, NULL); | ||
| 640 | } | 647 | } |
| 641 | 648 | ||
| 642 | static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer) | 649 | static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer) |
| 643 | { | 650 | { |
| 644 | int r; | 651 | int r, direct = stimer->config.direct_mode; |
| 645 | 652 | ||
| 646 | stimer->msg_pending = true; | 653 | stimer->msg_pending = true; |
| 647 | r = stimer_send_msg(stimer); | 654 | if (!direct) |
| 655 | r = stimer_send_msg(stimer); | ||
| 656 | else | ||
| 657 | r = stimer_notify_direct(stimer); | ||
| 648 | trace_kvm_hv_stimer_expiration(stimer_to_vcpu(stimer)->vcpu_id, | 658 | trace_kvm_hv_stimer_expiration(stimer_to_vcpu(stimer)->vcpu_id, |
| 649 | stimer->index, r); | 659 | stimer->index, direct, r); |
| 650 | if (!r) { | 660 | if (!r) { |
| 651 | stimer->msg_pending = false; | 661 | stimer->msg_pending = false; |
| 652 | if (!(stimer->config & HV_STIMER_PERIODIC)) | 662 | if (!(stimer->config.periodic)) |
| 653 | stimer->config &= ~HV_STIMER_ENABLE; | 663 | stimer->config.enable = 0; |
| 654 | } | 664 | } |
| 655 | } | 665 | } |
| 656 | 666 | ||
| @@ -664,7 +674,7 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu) | |||
| 664 | for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) | 674 | for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) |
| 665 | if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) { | 675 | if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) { |
| 666 | stimer = &hv_vcpu->stimer[i]; | 676 | stimer = &hv_vcpu->stimer[i]; |
| 667 | if (stimer->config & HV_STIMER_ENABLE) { | 677 | if (stimer->config.enable) { |
| 668 | exp_time = stimer->exp_time; | 678 | exp_time = stimer->exp_time; |
| 669 | 679 | ||
| 670 | if (exp_time) { | 680 | if (exp_time) { |
| @@ -674,7 +684,7 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu) | |||
| 674 | stimer_expiration(stimer); | 684 | stimer_expiration(stimer); |
| 675 | } | 685 | } |
| 676 | 686 | ||
| 677 | if ((stimer->config & HV_STIMER_ENABLE) && | 687 | if ((stimer->config.enable) && |
| 678 | stimer->count) { | 688 | stimer->count) { |
| 679 | if (!stimer->msg_pending) | 689 | if (!stimer->msg_pending) |
| 680 | stimer_start(stimer); | 690 | stimer_start(stimer); |
| @@ -815,9 +825,9 @@ static int kvm_hv_msr_set_crash_ctl(struct kvm_vcpu *vcpu, u64 data, bool host) | |||
| 815 | struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; | 825 | struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; |
| 816 | 826 | ||
| 817 | if (host) | 827 | if (host) |
| 818 | hv->hv_crash_ctl = data & HV_X64_MSR_CRASH_CTL_NOTIFY; | 828 | hv->hv_crash_ctl = data & HV_CRASH_CTL_CRASH_NOTIFY; |
| 819 | 829 | ||
| 820 | if (!host && (data & HV_X64_MSR_CRASH_CTL_NOTIFY)) { | 830 | if (!host && (data & HV_CRASH_CTL_CRASH_NOTIFY)) { |
| 821 | 831 | ||
| 822 | vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n", | 832 | vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n", |
| 823 | hv->hv_crash_param[0], | 833 | hv->hv_crash_param[0], |
| @@ -1758,3 +1768,124 @@ int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args) | |||
| 1758 | return kvm_hv_eventfd_deassign(kvm, args->conn_id); | 1768 | return kvm_hv_eventfd_deassign(kvm, args->conn_id); |
| 1759 | return kvm_hv_eventfd_assign(kvm, args->conn_id, args->fd); | 1769 | return kvm_hv_eventfd_assign(kvm, args->conn_id, args->fd); |
| 1760 | } | 1770 | } |
| 1771 | |||
| 1772 | int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, | ||
| 1773 | struct kvm_cpuid_entry2 __user *entries) | ||
| 1774 | { | ||
| 1775 | uint16_t evmcs_ver = kvm_x86_ops->nested_get_evmcs_version(vcpu); | ||
| 1776 | struct kvm_cpuid_entry2 cpuid_entries[] = { | ||
| 1777 | { .function = HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS }, | ||
| 1778 | { .function = HYPERV_CPUID_INTERFACE }, | ||
| 1779 | { .function = HYPERV_CPUID_VERSION }, | ||
| 1780 | { .function = HYPERV_CPUID_FEATURES }, | ||
| 1781 | { .function = HYPERV_CPUID_ENLIGHTMENT_INFO }, | ||
| 1782 | { .function = HYPERV_CPUID_IMPLEMENT_LIMITS }, | ||
| 1783 | { .function = HYPERV_CPUID_NESTED_FEATURES }, | ||
| 1784 | }; | ||
| 1785 | int i, nent = ARRAY_SIZE(cpuid_entries); | ||
| 1786 | |||
| 1787 | /* Skip NESTED_FEATURES if eVMCS is not supported */ | ||
| 1788 | if (!evmcs_ver) | ||
| 1789 | --nent; | ||
| 1790 | |||
| 1791 | if (cpuid->nent < nent) | ||
| 1792 | return -E2BIG; | ||
| 1793 | |||
| 1794 | if (cpuid->nent > nent) | ||
| 1795 | cpuid->nent = nent; | ||
| 1796 | |||
| 1797 | for (i = 0; i < nent; i++) { | ||
| 1798 | struct kvm_cpuid_entry2 *ent = &cpuid_entries[i]; | ||
| 1799 | u32 signature[3]; | ||
| 1800 | |||
| 1801 | switch (ent->function) { | ||
| 1802 | case HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS: | ||
| 1803 | memcpy(signature, "Linux KVM Hv", 12); | ||
| 1804 | |||
| 1805 | ent->eax = HYPERV_CPUID_NESTED_FEATURES; | ||
| 1806 | ent->ebx = signature[0]; | ||
| 1807 | ent->ecx = signature[1]; | ||
| 1808 | ent->edx = signature[2]; | ||
| 1809 | break; | ||
| 1810 | |||
| 1811 | case HYPERV_CPUID_INTERFACE: | ||
| 1812 | memcpy(signature, "Hv#1\0\0\0\0\0\0\0\0", 12); | ||
| 1813 | ent->eax = signature[0]; | ||
| 1814 | break; | ||
| 1815 | |||
| 1816 | case HYPERV_CPUID_VERSION: | ||
| 1817 | /* | ||
| 1818 | * We implement some Hyper-V 2016 functions so let's use | ||
| 1819 | * this version. | ||
| 1820 | */ | ||
| 1821 | ent->eax = 0x00003839; | ||
| 1822 | ent->ebx = 0x000A0000; | ||
| 1823 | break; | ||
| 1824 | |||
| 1825 | case HYPERV_CPUID_FEATURES: | ||
| 1826 | ent->eax |= HV_X64_MSR_VP_RUNTIME_AVAILABLE; | ||
| 1827 | ent->eax |= HV_MSR_TIME_REF_COUNT_AVAILABLE; | ||
| 1828 | ent->eax |= HV_X64_MSR_SYNIC_AVAILABLE; | ||
| 1829 | ent->eax |= HV_MSR_SYNTIMER_AVAILABLE; | ||
| 1830 | ent->eax |= HV_X64_MSR_APIC_ACCESS_AVAILABLE; | ||
| 1831 | ent->eax |= HV_X64_MSR_HYPERCALL_AVAILABLE; | ||
| 1832 | ent->eax |= HV_X64_MSR_VP_INDEX_AVAILABLE; | ||
| 1833 | ent->eax |= HV_X64_MSR_RESET_AVAILABLE; | ||
| 1834 | ent->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE; | ||
| 1835 | ent->eax |= HV_X64_MSR_GUEST_IDLE_AVAILABLE; | ||
| 1836 | ent->eax |= HV_X64_ACCESS_FREQUENCY_MSRS; | ||
| 1837 | ent->eax |= HV_X64_ACCESS_REENLIGHTENMENT; | ||
| 1838 | |||
| 1839 | ent->ebx |= HV_X64_POST_MESSAGES; | ||
| 1840 | ent->ebx |= HV_X64_SIGNAL_EVENTS; | ||
| 1841 | |||
| 1842 | ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE; | ||
| 1843 | ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; | ||
| 1844 | ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE; | ||
| 1845 | |||
| 1846 | break; | ||
| 1847 | |||
| 1848 | case HYPERV_CPUID_ENLIGHTMENT_INFO: | ||
| 1849 | ent->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; | ||
| 1850 | ent->eax |= HV_X64_APIC_ACCESS_RECOMMENDED; | ||
| 1851 | ent->eax |= HV_X64_SYSTEM_RESET_RECOMMENDED; | ||
| 1852 | ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED; | ||
| 1853 | ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; | ||
| 1854 | ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; | ||
| 1855 | ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; | ||
| 1856 | |||
| 1857 | /* | ||
| 1858 | * Default number of spinlock retry attempts, matches | ||
| 1859 | * HyperV 2016. | ||
| 1860 | */ | ||
| 1861 | ent->ebx = 0x00000FFF; | ||
| 1862 | |||
| 1863 | break; | ||
| 1864 | |||
| 1865 | case HYPERV_CPUID_IMPLEMENT_LIMITS: | ||
| 1866 | /* Maximum number of virtual processors */ | ||
| 1867 | ent->eax = KVM_MAX_VCPUS; | ||
| 1868 | /* | ||
| 1869 | * Maximum number of logical processors, matches | ||
| 1870 | * HyperV 2016. | ||
| 1871 | */ | ||
| 1872 | ent->ebx = 64; | ||
| 1873 | |||
| 1874 | break; | ||
| 1875 | |||
| 1876 | case HYPERV_CPUID_NESTED_FEATURES: | ||
| 1877 | ent->eax = evmcs_ver; | ||
| 1878 | |||
| 1879 | break; | ||
| 1880 | |||
| 1881 | default: | ||
| 1882 | break; | ||
| 1883 | } | ||
| 1884 | } | ||
| 1885 | |||
| 1886 | if (copy_to_user(entries, cpuid_entries, | ||
| 1887 | nent * sizeof(struct kvm_cpuid_entry2))) | ||
| 1888 | return -EFAULT; | ||
| 1889 | |||
| 1890 | return 0; | ||
| 1891 | } | ||
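The new kvm_vcpu_ioctl_get_hv_cpuid() backs a "get supported Hyper-V CPUID" vCPU ioctl (KVM_GET_SUPPORTED_HV_CPUID, wired up elsewhere in this series). A user-space sketch of the expected calling convention; the helper name and fd handling are assumptions:

    #include <linux/kvm.h>
    #include <sys/ioctl.h>
    #include <stdlib.h>
    #include <stdio.h>

    /* "vcpu_fd" is assumed to be an open KVM vCPU file descriptor. */
    static struct kvm_cpuid2 *get_hv_cpuid(int vcpu_fd)
    {
            int nent = 8;   /* >= the seven entries filled in above */
            struct kvm_cpuid2 *cpuid;

            cpuid = calloc(1, sizeof(*cpuid) +
                              nent * sizeof(struct kvm_cpuid_entry2));
            if (!cpuid)
                    return NULL;

            cpuid->nent = nent;
            if (ioctl(vcpu_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid) < 0) {
                    perror("KVM_GET_SUPPORTED_HV_CPUID");
                    free(cpuid);
                    return NULL;
            }
            return cpuid;   /* cpuid->nent now reflects the real entry count */
    }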
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 0e66c12ed2c3..fd7cf13a2144 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h | |||
| @@ -24,6 +24,8 @@ | |||
| 24 | #ifndef __ARCH_X86_KVM_HYPERV_H__ | 24 | #ifndef __ARCH_X86_KVM_HYPERV_H__ |
| 25 | #define __ARCH_X86_KVM_HYPERV_H__ | 25 | #define __ARCH_X86_KVM_HYPERV_H__ |
| 26 | 26 | ||
| 27 | #include <linux/kvm_host.h> | ||
| 28 | |||
| 27 | static inline struct kvm_vcpu_hv *vcpu_to_hv_vcpu(struct kvm_vcpu *vcpu) | 29 | static inline struct kvm_vcpu_hv *vcpu_to_hv_vcpu(struct kvm_vcpu *vcpu) |
| 28 | { | 30 | { |
| 29 | return &vcpu->arch.hyperv; | 31 | return &vcpu->arch.hyperv; |
| @@ -95,5 +97,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, | |||
| 95 | void kvm_hv_init_vm(struct kvm *kvm); | 97 | void kvm_hv_init_vm(struct kvm *kvm); |
| 96 | void kvm_hv_destroy_vm(struct kvm *kvm); | 98 | void kvm_hv_destroy_vm(struct kvm *kvm); |
| 97 | int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args); | 99 | int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args); |
| 100 | int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, | ||
| 101 | struct kvm_cpuid_entry2 __user *entries); | ||
| 98 | 102 | ||
| 99 | #endif | 103 | #endif |
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 9619dcc2b325..f8f56a93358b 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h | |||
| @@ -2,6 +2,8 @@ | |||
| 2 | #ifndef ASM_KVM_CACHE_REGS_H | 2 | #ifndef ASM_KVM_CACHE_REGS_H |
| 3 | #define ASM_KVM_CACHE_REGS_H | 3 | #define ASM_KVM_CACHE_REGS_H |
| 4 | 4 | ||
| 5 | #include <linux/kvm_host.h> | ||
| 6 | |||
| 5 | #define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS | 7 | #define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS |
| 6 | #define KVM_POSSIBLE_CR4_GUEST_BITS \ | 8 | #define KVM_POSSIBLE_CR4_GUEST_BITS \ |
| 7 | (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ | 9 | (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ |
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c4533d05c214..9f089e2e09d0 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c | |||
| @@ -251,10 +251,9 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) | |||
| 251 | 251 | ||
| 252 | if (enabled != apic->sw_enabled) { | 252 | if (enabled != apic->sw_enabled) { |
| 253 | apic->sw_enabled = enabled; | 253 | apic->sw_enabled = enabled; |
| 254 | if (enabled) { | 254 | if (enabled) |
| 255 | static_key_slow_dec_deferred(&apic_sw_disabled); | 255 | static_key_slow_dec_deferred(&apic_sw_disabled); |
| 256 | recalculate_apic_map(apic->vcpu->kvm); | 256 | else |
| 257 | } else | ||
| 258 | static_key_slow_inc(&apic_sw_disabled.key); | 257 | static_key_slow_inc(&apic_sw_disabled.key); |
| 259 | } | 258 | } |
| 260 | } | 259 | } |
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7c03c0f35444..ce770b446238 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
| @@ -264,6 +264,35 @@ static void mmu_spte_set(u64 *sptep, u64 spte); | |||
| 264 | static union kvm_mmu_page_role | 264 | static union kvm_mmu_page_role |
| 265 | kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); | 265 | kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); |
| 266 | 266 | ||
| 267 | |||
| 268 | static inline bool kvm_available_flush_tlb_with_range(void) | ||
| 269 | { | ||
| 270 | return kvm_x86_ops->tlb_remote_flush_with_range; | ||
| 271 | } | ||
| 272 | |||
| 273 | static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm, | ||
| 274 | struct kvm_tlb_range *range) | ||
| 275 | { | ||
| 276 | int ret = -ENOTSUPP; | ||
| 277 | |||
| 278 | if (range && kvm_x86_ops->tlb_remote_flush_with_range) | ||
| 279 | ret = kvm_x86_ops->tlb_remote_flush_with_range(kvm, range); | ||
| 280 | |||
| 281 | if (ret) | ||
| 282 | kvm_flush_remote_tlbs(kvm); | ||
| 283 | } | ||
| 284 | |||
| 285 | static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, | ||
| 286 | u64 start_gfn, u64 pages) | ||
| 287 | { | ||
| 288 | struct kvm_tlb_range range; | ||
| 289 | |||
| 290 | range.start_gfn = start_gfn; | ||
| 291 | range.pages = pages; | ||
| 292 | |||
| 293 | kvm_flush_remote_tlbs_with_range(kvm, &range); | ||
| 294 | } | ||
| 295 | |||
| 267 | void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) | 296 | void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) |
| 268 | { | 297 | { |
| 269 | BUG_ON((mmio_mask & mmio_value) != mmio_value); | 298 | BUG_ON((mmio_mask & mmio_value) != mmio_value); |
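The helpers added above let MMU code request a ranged remote TLB flush through the kvm_x86_ops->tlb_remote_flush_with_range hook and silently fall back to a full kvm_flush_remote_tlbs() when the hook is absent or returns an error. A minimal userspace model of that fallback, with illustrative names in place of the kvm_x86_ops plumbing:

#include <stdio.h>

struct tlb_range {
        unsigned long long start_gfn;
        unsigned long long pages;
};

/* Simulated backend hook; NULL means "ranged flush not supported". */
static int (*flush_with_range)(const struct tlb_range *range);

static void flush_all(void)
{
        puts("full remote TLB flush");
}

static void flush_range_or_all(unsigned long long start_gfn, unsigned long long pages)
{
        struct tlb_range range = { .start_gfn = start_gfn, .pages = pages };
        int ret = -1;                           /* mirrors the -ENOTSUPP default */

        if (flush_with_range)
                ret = flush_with_range(&range);
        if (ret)                                /* hook missing or it failed */
                flush_all();
}

static int fake_ranged_flush(const struct tlb_range *range)
{
        printf("ranged flush: gfn %llu, %llu pages\n",
               range->start_gfn, range->pages);
        return 0;
}

int main(void)
{
        flush_range_or_all(0x1000, 512);        /* no hook yet: full flush */
        flush_with_range = fake_ranged_flush;
        flush_range_or_all(0x1000, 512);        /* hook present: ranged flush */
        return 0;
}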
| @@ -1456,8 +1485,12 @@ static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) | |||
| 1456 | 1485 | ||
| 1457 | static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) | 1486 | static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) |
| 1458 | { | 1487 | { |
| 1459 | if (__drop_large_spte(vcpu->kvm, sptep)) | 1488 | if (__drop_large_spte(vcpu->kvm, sptep)) { |
| 1460 | kvm_flush_remote_tlbs(vcpu->kvm); | 1489 | struct kvm_mmu_page *sp = page_header(__pa(sptep)); |
| 1490 | |||
| 1491 | kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, | ||
| 1492 | KVM_PAGES_PER_HPAGE(sp->role.level)); | ||
| 1493 | } | ||
| 1461 | } | 1494 | } |
| 1462 | 1495 | ||
| 1463 | /* | 1496 | /* |
| @@ -1743,10 +1776,12 @@ restart: | |||
| 1743 | } | 1776 | } |
| 1744 | } | 1777 | } |
| 1745 | 1778 | ||
| 1746 | if (need_flush) | 1779 | if (need_flush && kvm_available_flush_tlb_with_range()) { |
| 1747 | kvm_flush_remote_tlbs(kvm); | 1780 | kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); |
| 1781 | return 0; | ||
| 1782 | } | ||
| 1748 | 1783 | ||
| 1749 | return 0; | 1784 | return need_flush; |
| 1750 | } | 1785 | } |
| 1751 | 1786 | ||
| 1752 | struct slot_rmap_walk_iterator { | 1787 | struct slot_rmap_walk_iterator { |
| @@ -1880,9 +1915,9 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) | |||
| 1880 | return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); | 1915 | return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); |
| 1881 | } | 1916 | } |
| 1882 | 1917 | ||
| 1883 | void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) | 1918 | int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) |
| 1884 | { | 1919 | { |
| 1885 | kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); | 1920 | return kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); |
| 1886 | } | 1921 | } |
| 1887 | 1922 | ||
| 1888 | static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, | 1923 | static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, |
| @@ -1925,7 +1960,8 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) | |||
| 1925 | rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); | 1960 | rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); |
| 1926 | 1961 | ||
| 1927 | kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0); | 1962 | kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0); |
| 1928 | kvm_flush_remote_tlbs(vcpu->kvm); | 1963 | kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, |
| 1964 | KVM_PAGES_PER_HPAGE(sp->role.level)); | ||
| 1929 | } | 1965 | } |
| 1930 | 1966 | ||
| 1931 | int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) | 1967 | int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) |
| @@ -2441,7 +2477,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
| 2441 | account_shadowed(vcpu->kvm, sp); | 2477 | account_shadowed(vcpu->kvm, sp); |
| 2442 | if (level == PT_PAGE_TABLE_LEVEL && | 2478 | if (level == PT_PAGE_TABLE_LEVEL && |
| 2443 | rmap_write_protect(vcpu, gfn)) | 2479 | rmap_write_protect(vcpu, gfn)) |
| 2444 | kvm_flush_remote_tlbs(vcpu->kvm); | 2480 | kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); |
| 2445 | 2481 | ||
| 2446 | if (level > PT_PAGE_TABLE_LEVEL && need_sync) | 2482 | if (level > PT_PAGE_TABLE_LEVEL && need_sync) |
| 2447 | flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); | 2483 | flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); |
| @@ -2561,7 +2597,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
| 2561 | return; | 2597 | return; |
| 2562 | 2598 | ||
| 2563 | drop_parent_pte(child, sptep); | 2599 | drop_parent_pte(child, sptep); |
| 2564 | kvm_flush_remote_tlbs(vcpu->kvm); | 2600 | kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1); |
| 2565 | } | 2601 | } |
| 2566 | } | 2602 | } |
| 2567 | 2603 | ||
| @@ -2985,8 +3021,10 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, | |||
| 2985 | ret = RET_PF_EMULATE; | 3021 | ret = RET_PF_EMULATE; |
| 2986 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); | 3022 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); |
| 2987 | } | 3023 | } |
| 3024 | |||
| 2988 | if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush) | 3025 | if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush) |
| 2989 | kvm_flush_remote_tlbs(vcpu->kvm); | 3026 | kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, |
| 3027 | KVM_PAGES_PER_HPAGE(level)); | ||
| 2990 | 3028 | ||
| 2991 | if (unlikely(is_mmio_spte(*sptep))) | 3029 | if (unlikely(is_mmio_spte(*sptep))) |
| 2992 | ret = RET_PF_EMULATE; | 3030 | ret = RET_PF_EMULATE; |
| @@ -5586,8 +5624,13 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) | |||
| 5586 | { | 5624 | { |
| 5587 | struct kvm_memslots *slots; | 5625 | struct kvm_memslots *slots; |
| 5588 | struct kvm_memory_slot *memslot; | 5626 | struct kvm_memory_slot *memslot; |
| 5627 | bool flush_tlb = true; | ||
| 5628 | bool flush = false; | ||
| 5589 | int i; | 5629 | int i; |
| 5590 | 5630 | ||
| 5631 | if (kvm_available_flush_tlb_with_range()) | ||
| 5632 | flush_tlb = false; | ||
| 5633 | |||
| 5591 | spin_lock(&kvm->mmu_lock); | 5634 | spin_lock(&kvm->mmu_lock); |
| 5592 | for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { | 5635 | for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { |
| 5593 | slots = __kvm_memslots(kvm, i); | 5636 | slots = __kvm_memslots(kvm, i); |
| @@ -5599,12 +5642,17 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) | |||
| 5599 | if (start >= end) | 5642 | if (start >= end) |
| 5600 | continue; | 5643 | continue; |
| 5601 | 5644 | ||
| 5602 | slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, | 5645 | flush |= slot_handle_level_range(kvm, memslot, |
| 5603 | PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL, | 5646 | kvm_zap_rmapp, PT_PAGE_TABLE_LEVEL, |
| 5604 | start, end - 1, true); | 5647 | PT_MAX_HUGEPAGE_LEVEL, start, |
| 5648 | end - 1, flush_tlb); | ||
| 5605 | } | 5649 | } |
| 5606 | } | 5650 | } |
| 5607 | 5651 | ||
| 5652 | if (flush) | ||
| 5653 | kvm_flush_remote_tlbs_with_address(kvm, gfn_start, | ||
| 5654 | gfn_end - gfn_start + 1); | ||
| 5655 | |||
| 5608 | spin_unlock(&kvm->mmu_lock); | 5656 | spin_unlock(&kvm->mmu_lock); |
| 5609 | } | 5657 | } |
| 5610 | 5658 | ||
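kvm_zap_gfn_range() now defers the flush: each slot walk only reports whether it zapped anything, and when the ranged hook is available a single flush covers the whole span at the end. A compact sketch of that accumulate-then-flush shape, with stand-in helpers:

#include <stdbool.h>
#include <stdio.h>

static bool zap_slot(int slot)
{
        return slot % 2 == 0;           /* pretend even-numbered slots had mappings */
}

static void zap_gfn_range(int nslots, unsigned long start_gfn, unsigned long end_gfn)
{
        bool flush = false;

        for (int i = 0; i < nslots; i++)
                flush |= zap_slot(i);   /* the walks no longer flush individually */

        if (flush)                      /* one ranged flush for the whole span */
                printf("ranged flush for gfns [%lu, %lu)\n", start_gfn, end_gfn);
}

int main(void)
{
        zap_gfn_range(4, 0x1000, 0x2000);
        return 0;
}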
| @@ -5638,12 +5686,13 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, | |||
| 5638 | * spte from present to present (changing the spte from present | 5686 | * spte from present to present (changing the spte from present |
| 5639 | * to nonpresent will flush all the TLBs immediately), in other | 5687 | * to nonpresent will flush all the TLBs immediately), in other |
| 5640 | * words, the only case we care is mmu_spte_update() where we | 5688 | * words, the only case we care is mmu_spte_update() where we |
| 5641 | * haved checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE | 5689 | * have checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE |
| 5642 | * instead of PT_WRITABLE_MASK, that means it does not depend | 5690 | * instead of PT_WRITABLE_MASK, that means it does not depend |
| 5643 | * on PT_WRITABLE_MASK anymore. | 5691 | * on PT_WRITABLE_MASK anymore. |
| 5644 | */ | 5692 | */ |
| 5645 | if (flush) | 5693 | if (flush) |
| 5646 | kvm_flush_remote_tlbs(kvm); | 5694 | kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn, |
| 5695 | memslot->npages); | ||
| 5647 | } | 5696 | } |
| 5648 | 5697 | ||
| 5649 | static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, | 5698 | static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, |
| @@ -5671,7 +5720,13 @@ restart: | |||
| 5671 | !kvm_is_reserved_pfn(pfn) && | 5720 | !kvm_is_reserved_pfn(pfn) && |
| 5672 | PageTransCompoundMap(pfn_to_page(pfn))) { | 5721 | PageTransCompoundMap(pfn_to_page(pfn))) { |
| 5673 | pte_list_remove(rmap_head, sptep); | 5722 | pte_list_remove(rmap_head, sptep); |
| 5674 | need_tlb_flush = 1; | 5723 | |
| 5724 | if (kvm_available_flush_tlb_with_range()) | ||
| 5725 | kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, | ||
| 5726 | KVM_PAGES_PER_HPAGE(sp->role.level)); | ||
| 5727 | else | ||
| 5728 | need_tlb_flush = 1; | ||
| 5729 | |||
| 5675 | goto restart; | 5730 | goto restart; |
| 5676 | } | 5731 | } |
| 5677 | } | 5732 | } |
| @@ -5707,7 +5762,8 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, | |||
| 5707 | * dirty_bitmap. | 5762 | * dirty_bitmap. |
| 5708 | */ | 5763 | */ |
| 5709 | if (flush) | 5764 | if (flush) |
| 5710 | kvm_flush_remote_tlbs(kvm); | 5765 | kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn, |
| 5766 | memslot->npages); | ||
| 5711 | } | 5767 | } |
| 5712 | EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty); | 5768 | EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty); |
| 5713 | 5769 | ||
| @@ -5725,7 +5781,8 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, | |||
| 5725 | lockdep_assert_held(&kvm->slots_lock); | 5781 | lockdep_assert_held(&kvm->slots_lock); |
| 5726 | 5782 | ||
| 5727 | if (flush) | 5783 | if (flush) |
| 5728 | kvm_flush_remote_tlbs(kvm); | 5784 | kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn, |
| 5785 | memslot->npages); | ||
| 5729 | } | 5786 | } |
| 5730 | EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access); | 5787 | EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access); |
| 5731 | 5788 | ||
| @@ -5742,7 +5799,8 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, | |||
| 5742 | 5799 | ||
| 5743 | /* see kvm_mmu_slot_leaf_clear_dirty */ | 5800 | /* see kvm_mmu_slot_leaf_clear_dirty */ |
| 5744 | if (flush) | 5801 | if (flush) |
| 5745 | kvm_flush_remote_tlbs(kvm); | 5802 | kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn, |
| 5803 | memslot->npages); | ||
| 5746 | } | 5804 | } |
| 5747 | EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); | 5805 | EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); |
| 5748 | 5806 | ||
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 7cf2185b7eb5..6bdca39829bc 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
| @@ -894,7 +894,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) | |||
| 894 | pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); | 894 | pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); |
| 895 | 895 | ||
| 896 | if (mmu_page_zap_pte(vcpu->kvm, sp, sptep)) | 896 | if (mmu_page_zap_pte(vcpu->kvm, sp, sptep)) |
| 897 | kvm_flush_remote_tlbs(vcpu->kvm); | 897 | kvm_flush_remote_tlbs_with_address(vcpu->kvm, |
| 898 | sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); | ||
| 898 | 899 | ||
| 899 | if (!rmap_can_add(vcpu)) | 900 | if (!rmap_can_add(vcpu)) |
| 900 | break; | 901 | break; |
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 101f53ccf571..307e5bddb6d9 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
| @@ -675,11 +675,6 @@ struct svm_cpu_data { | |||
| 675 | 675 | ||
| 676 | static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); | 676 | static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); |
| 677 | 677 | ||
| 678 | struct svm_init_data { | ||
| 679 | int cpu; | ||
| 680 | int r; | ||
| 681 | }; | ||
| 682 | |||
| 683 | static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; | 678 | static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; |
| 684 | 679 | ||
| 685 | #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) | 680 | #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) |
| @@ -711,17 +706,17 @@ static u32 svm_msrpm_offset(u32 msr) | |||
| 711 | 706 | ||
| 712 | static inline void clgi(void) | 707 | static inline void clgi(void) |
| 713 | { | 708 | { |
| 714 | asm volatile (__ex(SVM_CLGI)); | 709 | asm volatile (__ex("clgi")); |
| 715 | } | 710 | } |
| 716 | 711 | ||
| 717 | static inline void stgi(void) | 712 | static inline void stgi(void) |
| 718 | { | 713 | { |
| 719 | asm volatile (__ex(SVM_STGI)); | 714 | asm volatile (__ex("stgi")); |
| 720 | } | 715 | } |
| 721 | 716 | ||
| 722 | static inline void invlpga(unsigned long addr, u32 asid) | 717 | static inline void invlpga(unsigned long addr, u32 asid) |
| 723 | { | 718 | { |
| 724 | asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); | 719 | asm volatile (__ex("invlpga %1, %0") : : "c"(asid), "a"(addr)); |
| 725 | } | 720 | } |
| 726 | 721 | ||
| 727 | static int get_npt_level(struct kvm_vcpu *vcpu) | 722 | static int get_npt_level(struct kvm_vcpu *vcpu) |
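The clgi/stgi/invlpga wrappers above now hand plain mnemonics to __ex() instead of the pre-encoded opcode macros, and invlpga spells out its operands so they line up with the "c"/"a" register constraints. The snippet below only illustrates that extended-asm constraint binding on x86-64, using a harmless lea in place of the privileged SVM instructions; it is not KVM code.

#include <stdio.h>

static unsigned long add_via_lea(unsigned long addr, unsigned int asid)
{
        unsigned long out;

        /* addr is forced into RAX and asid into RCX, as for invlpga above */
        asm volatile("lea (%%rax,%%rcx), %0"
                     : "=r"(out)
                     : "a"(addr), "c"((unsigned long)asid));
        return out;
}

int main(void)
{
        printf("%lu\n", add_via_lea(100, 28));  /* prints 128 */
        return 0;
}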
| @@ -1456,10 +1451,11 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) | |||
| 1456 | g_tsc_offset = svm->vmcb->control.tsc_offset - | 1451 | g_tsc_offset = svm->vmcb->control.tsc_offset - |
| 1457 | svm->nested.hsave->control.tsc_offset; | 1452 | svm->nested.hsave->control.tsc_offset; |
| 1458 | svm->nested.hsave->control.tsc_offset = offset; | 1453 | svm->nested.hsave->control.tsc_offset = offset; |
| 1459 | } else | 1454 | } |
| 1460 | trace_kvm_write_tsc_offset(vcpu->vcpu_id, | 1455 | |
| 1461 | svm->vmcb->control.tsc_offset, | 1456 | trace_kvm_write_tsc_offset(vcpu->vcpu_id, |
| 1462 | offset); | 1457 | svm->vmcb->control.tsc_offset - g_tsc_offset, |
| 1458 | offset); | ||
| 1463 | 1459 | ||
| 1464 | svm->vmcb->control.tsc_offset = offset + g_tsc_offset; | 1460 | svm->vmcb->control.tsc_offset = offset + g_tsc_offset; |
| 1465 | 1461 | ||
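After the rewrite above, the tracepoint fires on both the nested and non-nested paths: the previous L1 offset is recovered by subtracting g_tsc_offset (the L1-to-L2 delta, left at zero when no nested guest is running) from the currently programmed total, and the new total is the requested offset plus that delta. Worked through with made-up numbers:

#include <stdio.h>

int main(void)
{
        long long vmcb_offset   = 5000; /* total offset currently programmed */
        long long g_tsc_offset  = 2000; /* L1 -> L2 delta; 0 with no nested guest */
        long long new_l1_offset = 1500; /* offset being written on behalf of L1 */

        long long old_l1_offset = vmcb_offset - g_tsc_offset;   /* 3000 */
        long long new_total     = new_l1_offset + g_tsc_offset; /* 3500 */

        printf("trace: previous L1 offset %lld, new L1 offset %lld\n",
               old_l1_offset, new_l1_offset);
        printf("vmcb->control.tsc_offset becomes %lld\n", new_total);
        return 0;
}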
| @@ -2129,6 +2125,13 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) | |||
| 2129 | goto out; | 2125 | goto out; |
| 2130 | } | 2126 | } |
| 2131 | 2127 | ||
| 2128 | svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL); | ||
| 2129 | if (!svm->vcpu.arch.guest_fpu) { | ||
| 2130 | printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); | ||
| 2131 | err = -ENOMEM; | ||
| 2132 | goto free_partial_svm; | ||
| 2133 | } | ||
| 2134 | |||
| 2132 | err = kvm_vcpu_init(&svm->vcpu, kvm, id); | 2135 | err = kvm_vcpu_init(&svm->vcpu, kvm, id); |
| 2133 | if (err) | 2136 | if (err) |
| 2134 | goto free_svm; | 2137 | goto free_svm; |
| @@ -2188,6 +2191,8 @@ free_page1: | |||
| 2188 | uninit: | 2191 | uninit: |
| 2189 | kvm_vcpu_uninit(&svm->vcpu); | 2192 | kvm_vcpu_uninit(&svm->vcpu); |
| 2190 | free_svm: | 2193 | free_svm: |
| 2194 | kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu); | ||
| 2195 | free_partial_svm: | ||
| 2191 | kmem_cache_free(kvm_vcpu_cache, svm); | 2196 | kmem_cache_free(kvm_vcpu_cache, svm); |
| 2192 | out: | 2197 | out: |
| 2193 | return ERR_PTR(err); | 2198 | return ERR_PTR(err); |
| @@ -2217,6 +2222,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) | |||
| 2217 | __free_page(virt_to_page(svm->nested.hsave)); | 2222 | __free_page(virt_to_page(svm->nested.hsave)); |
| 2218 | __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); | 2223 | __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); |
| 2219 | kvm_vcpu_uninit(vcpu); | 2224 | kvm_vcpu_uninit(vcpu); |
| 2225 | kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu); | ||
| 2220 | kmem_cache_free(kvm_vcpu_cache, svm); | 2226 | kmem_cache_free(kvm_vcpu_cache, svm); |
| 2221 | } | 2227 | } |
| 2222 | 2228 | ||
| @@ -3278,6 +3284,8 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr | |||
| 3278 | dst->event_inj_err = from->event_inj_err; | 3284 | dst->event_inj_err = from->event_inj_err; |
| 3279 | dst->nested_cr3 = from->nested_cr3; | 3285 | dst->nested_cr3 = from->nested_cr3; |
| 3280 | dst->virt_ext = from->virt_ext; | 3286 | dst->virt_ext = from->virt_ext; |
| 3287 | dst->pause_filter_count = from->pause_filter_count; | ||
| 3288 | dst->pause_filter_thresh = from->pause_filter_thresh; | ||
| 3281 | } | 3289 | } |
| 3282 | 3290 | ||
| 3283 | static int nested_svm_vmexit(struct vcpu_svm *svm) | 3291 | static int nested_svm_vmexit(struct vcpu_svm *svm) |
| @@ -3356,6 +3364,11 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) | |||
| 3356 | nested_vmcb->control.event_inj = 0; | 3364 | nested_vmcb->control.event_inj = 0; |
| 3357 | nested_vmcb->control.event_inj_err = 0; | 3365 | nested_vmcb->control.event_inj_err = 0; |
| 3358 | 3366 | ||
| 3367 | nested_vmcb->control.pause_filter_count = | ||
| 3368 | svm->vmcb->control.pause_filter_count; | ||
| 3369 | nested_vmcb->control.pause_filter_thresh = | ||
| 3370 | svm->vmcb->control.pause_filter_thresh; | ||
| 3371 | |||
| 3359 | /* We always set V_INTR_MASKING and remember the old value in hflags */ | 3372 | /* We always set V_INTR_MASKING and remember the old value in hflags */ |
| 3360 | if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) | 3373 | if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) |
| 3361 | nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; | 3374 | nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; |
| @@ -3532,6 +3545,11 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, | |||
| 3532 | svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; | 3545 | svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; |
| 3533 | svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; | 3546 | svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; |
| 3534 | 3547 | ||
| 3548 | svm->vmcb->control.pause_filter_count = | ||
| 3549 | nested_vmcb->control.pause_filter_count; | ||
| 3550 | svm->vmcb->control.pause_filter_thresh = | ||
| 3551 | nested_vmcb->control.pause_filter_thresh; | ||
| 3552 | |||
| 3535 | nested_svm_unmap(page); | 3553 | nested_svm_unmap(page); |
| 3536 | 3554 | ||
| 3537 | /* Enter Guest-Mode */ | 3555 | /* Enter Guest-Mode */ |
| @@ -5636,9 +5654,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) | |||
| 5636 | /* Enter guest mode */ | 5654 | /* Enter guest mode */ |
| 5637 | "push %%" _ASM_AX " \n\t" | 5655 | "push %%" _ASM_AX " \n\t" |
| 5638 | "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t" | 5656 | "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t" |
| 5639 | __ex(SVM_VMLOAD) "\n\t" | 5657 | __ex("vmload %%" _ASM_AX) "\n\t" |
| 5640 | __ex(SVM_VMRUN) "\n\t" | 5658 | __ex("vmrun %%" _ASM_AX) "\n\t" |
| 5641 | __ex(SVM_VMSAVE) "\n\t" | 5659 | __ex("vmsave %%" _ASM_AX) "\n\t" |
| 5642 | "pop %%" _ASM_AX " \n\t" | 5660 | "pop %%" _ASM_AX " \n\t" |
| 5643 | 5661 | ||
| 5644 | /* Save guest registers, load host registers */ | 5662 | /* Save guest registers, load host registers */ |
| @@ -5836,6 +5854,13 @@ static bool svm_cpu_has_accelerated_tpr(void) | |||
| 5836 | 5854 | ||
| 5837 | static bool svm_has_emulated_msr(int index) | 5855 | static bool svm_has_emulated_msr(int index) |
| 5838 | { | 5856 | { |
| 5857 | switch (index) { | ||
| 5858 | case MSR_IA32_MCG_EXT_CTL: | ||
| 5859 | return false; | ||
| 5860 | default: | ||
| 5861 | break; | ||
| 5862 | } | ||
| 5863 | |||
| 5839 | return true; | 5864 | return true; |
| 5840 | } | 5865 | } |
| 5841 | 5866 | ||
| @@ -5924,6 +5949,11 @@ static bool svm_umip_emulated(void) | |||
| 5924 | return false; | 5949 | return false; |
| 5925 | } | 5950 | } |
| 5926 | 5951 | ||
| 5952 | static bool svm_pt_supported(void) | ||
| 5953 | { | ||
| 5954 | return false; | ||
| 5955 | } | ||
| 5956 | |||
| 5927 | static bool svm_has_wbinvd_exit(void) | 5957 | static bool svm_has_wbinvd_exit(void) |
| 5928 | { | 5958 | { |
| 5929 | return true; | 5959 | return true; |
| @@ -7053,6 +7083,12 @@ failed: | |||
| 7053 | return ret; | 7083 | return ret; |
| 7054 | } | 7084 | } |
| 7055 | 7085 | ||
| 7086 | static uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu) | ||
| 7087 | { | ||
| 7088 | /* Not supported */ | ||
| 7089 | return 0; | ||
| 7090 | } | ||
| 7091 | |||
| 7056 | static int nested_enable_evmcs(struct kvm_vcpu *vcpu, | 7092 | static int nested_enable_evmcs(struct kvm_vcpu *vcpu, |
| 7057 | uint16_t *vmcs_version) | 7093 | uint16_t *vmcs_version) |
| 7058 | { | 7094 | { |
| @@ -7159,6 +7195,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { | |||
| 7159 | .mpx_supported = svm_mpx_supported, | 7195 | .mpx_supported = svm_mpx_supported, |
| 7160 | .xsaves_supported = svm_xsaves_supported, | 7196 | .xsaves_supported = svm_xsaves_supported, |
| 7161 | .umip_emulated = svm_umip_emulated, | 7197 | .umip_emulated = svm_umip_emulated, |
| 7198 | .pt_supported = svm_pt_supported, | ||
| 7162 | 7199 | ||
| 7163 | .set_supported_cpuid = svm_set_supported_cpuid, | 7200 | .set_supported_cpuid = svm_set_supported_cpuid, |
| 7164 | 7201 | ||
| @@ -7191,6 +7228,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { | |||
| 7191 | .mem_enc_unreg_region = svm_unregister_enc_region, | 7228 | .mem_enc_unreg_region = svm_unregister_enc_region, |
| 7192 | 7229 | ||
| 7193 | .nested_enable_evmcs = nested_enable_evmcs, | 7230 | .nested_enable_evmcs = nested_enable_evmcs, |
| 7231 | .nested_get_evmcs_version = nested_get_evmcs_version, | ||
| 7194 | }; | 7232 | }; |
| 7195 | 7233 | ||
| 7196 | static int __init svm_init(void) | 7234 | static int __init svm_init(void) |
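The final svm.c hunks add stub callbacks, pt_supported returning false and nested_get_evmcs_version returning 0, so common code can call through the kvm_x86_ops table without NULL checks. A toy version of that ops-table convention:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct x86_ops {
        bool (*pt_supported)(void);
        uint16_t (*nested_get_evmcs_version)(void *vcpu);
};

/* SVM-style stubs: the feature is absent, but the slot is still populated. */
static bool stub_pt_supported(void)
{
        return false;
}

static uint16_t stub_get_evmcs_version(void *vcpu)
{
        (void)vcpu;
        return 0;               /* "not supported" */
}

static const struct x86_ops svm_like_ops = {
        .pt_supported             = stub_pt_supported,
        .nested_get_evmcs_version = stub_get_evmcs_version,
};

int main(void)
{
        printf("PT supported: %d, eVMCS version: %d\n",
               svm_like_ops.pt_supported(),
               svm_like_ops.nested_get_evmcs_version(NULL));
        return 0;
}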
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 0659465a745c..705f40ae2532 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h | |||
| @@ -1254,24 +1254,26 @@ TRACE_EVENT(kvm_hv_stimer_callback, | |||
| 1254 | * Tracepoint for stimer_expiration. | 1254 | * Tracepoint for stimer_expiration. |
| 1255 | */ | 1255 | */ |
| 1256 | TRACE_EVENT(kvm_hv_stimer_expiration, | 1256 | TRACE_EVENT(kvm_hv_stimer_expiration, |
| 1257 | TP_PROTO(int vcpu_id, int timer_index, int msg_send_result), | 1257 | TP_PROTO(int vcpu_id, int timer_index, int direct, int msg_send_result), |
| 1258 | TP_ARGS(vcpu_id, timer_index, msg_send_result), | 1258 | TP_ARGS(vcpu_id, timer_index, direct, msg_send_result), |
| 1259 | 1259 | ||
| 1260 | TP_STRUCT__entry( | 1260 | TP_STRUCT__entry( |
| 1261 | __field(int, vcpu_id) | 1261 | __field(int, vcpu_id) |
| 1262 | __field(int, timer_index) | 1262 | __field(int, timer_index) |
| 1263 | __field(int, direct) | ||
| 1263 | __field(int, msg_send_result) | 1264 | __field(int, msg_send_result) |
| 1264 | ), | 1265 | ), |
| 1265 | 1266 | ||
| 1266 | TP_fast_assign( | 1267 | TP_fast_assign( |
| 1267 | __entry->vcpu_id = vcpu_id; | 1268 | __entry->vcpu_id = vcpu_id; |
| 1268 | __entry->timer_index = timer_index; | 1269 | __entry->timer_index = timer_index; |
| 1270 | __entry->direct = direct; | ||
| 1269 | __entry->msg_send_result = msg_send_result; | 1271 | __entry->msg_send_result = msg_send_result; |
| 1270 | ), | 1272 | ), |
| 1271 | 1273 | ||
| 1272 | TP_printk("vcpu_id %d timer %d msg send result %d", | 1274 | TP_printk("vcpu_id %d timer %d direct %d send result %d", |
| 1273 | __entry->vcpu_id, __entry->timer_index, | 1275 | __entry->vcpu_id, __entry->timer_index, |
| 1274 | __entry->msg_send_result) | 1276 | __entry->direct, __entry->msg_send_result) |
| 1275 | ); | 1277 | ); |
| 1276 | 1278 | ||
| 1277 | /* | 1279 | /* |
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c deleted file mode 100644 index 8d5d984541be..000000000000 --- a/arch/x86/kvm/vmx.c +++ /dev/null | |||
| @@ -1,15252 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * Kernel-based Virtual Machine driver for Linux | ||
| 3 | * | ||
| 4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
| 5 | * machines without emulation or binary translation. | ||
| 6 | * | ||
| 7 | * Copyright (C) 2006 Qumranet, Inc. | ||
| 8 | * Copyright 2010 Red Hat, Inc. and/or its affiliates. | ||
| 9 | * | ||
| 10 | * Authors: | ||
| 11 | * Avi Kivity <avi@qumranet.com> | ||
| 12 | * Yaniv Kamay <yaniv@qumranet.com> | ||
| 13 | * | ||
| 14 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
| 15 | * the COPYING file in the top-level directory. | ||
| 16 | * | ||
| 17 | */ | ||
| 18 | |||
| 19 | #include "irq.h" | ||
| 20 | #include "mmu.h" | ||
| 21 | #include "cpuid.h" | ||
| 22 | #include "lapic.h" | ||
| 23 | #include "hyperv.h" | ||
| 24 | |||
| 25 | #include <linux/kvm_host.h> | ||
| 26 | #include <linux/module.h> | ||
| 27 | #include <linux/kernel.h> | ||
| 28 | #include <linux/mm.h> | ||
| 29 | #include <linux/highmem.h> | ||
| 30 | #include <linux/sched.h> | ||
| 31 | #include <linux/moduleparam.h> | ||
| 32 | #include <linux/mod_devicetable.h> | ||
| 33 | #include <linux/trace_events.h> | ||
| 34 | #include <linux/slab.h> | ||
| 35 | #include <linux/tboot.h> | ||
| 36 | #include <linux/hrtimer.h> | ||
| 37 | #include <linux/frame.h> | ||
| 38 | #include <linux/nospec.h> | ||
| 39 | #include "kvm_cache_regs.h" | ||
| 40 | #include "x86.h" | ||
| 41 | |||
| 42 | #include <asm/asm.h> | ||
| 43 | #include <asm/cpu.h> | ||
| 44 | #include <asm/io.h> | ||
| 45 | #include <asm/desc.h> | ||
| 46 | #include <asm/vmx.h> | ||
| 47 | #include <asm/virtext.h> | ||
| 48 | #include <asm/mce.h> | ||
| 49 | #include <asm/fpu/internal.h> | ||
| 50 | #include <asm/perf_event.h> | ||
| 51 | #include <asm/debugreg.h> | ||
| 52 | #include <asm/kexec.h> | ||
| 53 | #include <asm/apic.h> | ||
| 54 | #include <asm/irq_remapping.h> | ||
| 55 | #include <asm/mmu_context.h> | ||
| 56 | #include <asm/spec-ctrl.h> | ||
| 57 | #include <asm/mshyperv.h> | ||
| 58 | |||
| 59 | #include "trace.h" | ||
| 60 | #include "pmu.h" | ||
| 61 | #include "vmx_evmcs.h" | ||
| 62 | |||
| 63 | #define __ex(x) __kvm_handle_fault_on_reboot(x) | ||
| 64 | #define __ex_clear(x, reg) \ | ||
| 65 | ____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg) | ||
| 66 | |||
| 67 | MODULE_AUTHOR("Qumranet"); | ||
| 68 | MODULE_LICENSE("GPL"); | ||
| 69 | |||
| 70 | static const struct x86_cpu_id vmx_cpu_id[] = { | ||
| 71 | X86_FEATURE_MATCH(X86_FEATURE_VMX), | ||
| 72 | {} | ||
| 73 | }; | ||
| 74 | MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); | ||
| 75 | |||
| 76 | static bool __read_mostly enable_vpid = 1; | ||
| 77 | module_param_named(vpid, enable_vpid, bool, 0444); | ||
| 78 | |||
| 79 | static bool __read_mostly enable_vnmi = 1; | ||
| 80 | module_param_named(vnmi, enable_vnmi, bool, S_IRUGO); | ||
| 81 | |||
| 82 | static bool __read_mostly flexpriority_enabled = 1; | ||
| 83 | module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); | ||
| 84 | |||
| 85 | static bool __read_mostly enable_ept = 1; | ||
| 86 | module_param_named(ept, enable_ept, bool, S_IRUGO); | ||
| 87 | |||
| 88 | static bool __read_mostly enable_unrestricted_guest = 1; | ||
| 89 | module_param_named(unrestricted_guest, | ||
| 90 | enable_unrestricted_guest, bool, S_IRUGO); | ||
| 91 | |||
| 92 | static bool __read_mostly enable_ept_ad_bits = 1; | ||
| 93 | module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); | ||
| 94 | |||
| 95 | static bool __read_mostly emulate_invalid_guest_state = true; | ||
| 96 | module_param(emulate_invalid_guest_state, bool, S_IRUGO); | ||
| 97 | |||
| 98 | static bool __read_mostly fasteoi = 1; | ||
| 99 | module_param(fasteoi, bool, S_IRUGO); | ||
| 100 | |||
| 101 | static bool __read_mostly enable_apicv = 1; | ||
| 102 | module_param(enable_apicv, bool, S_IRUGO); | ||
| 103 | |||
| 104 | static bool __read_mostly enable_shadow_vmcs = 1; | ||
| 105 | module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); | ||
| 106 | /* | ||
| 107 | * If nested=1, nested virtualization is supported, i.e., guests may use | ||
| 108 | * VMX and be a hypervisor for its own guests. If nested=0, guests may not | ||
| 109 | * use VMX instructions. | ||
| 110 | */ | ||
| 111 | static bool __read_mostly nested = 1; | ||
| 112 | module_param(nested, bool, S_IRUGO); | ||
| 113 | |||
| 114 | static bool __read_mostly nested_early_check = 0; | ||
| 115 | module_param(nested_early_check, bool, S_IRUGO); | ||
| 116 | |||
| 117 | static u64 __read_mostly host_xss; | ||
| 118 | |||
| 119 | static bool __read_mostly enable_pml = 1; | ||
| 120 | module_param_named(pml, enable_pml, bool, S_IRUGO); | ||
| 121 | |||
| 122 | #define MSR_TYPE_R 1 | ||
| 123 | #define MSR_TYPE_W 2 | ||
| 124 | #define MSR_TYPE_RW 3 | ||
| 125 | |||
| 126 | #define MSR_BITMAP_MODE_X2APIC 1 | ||
| 127 | #define MSR_BITMAP_MODE_X2APIC_APICV 2 | ||
| 128 | |||
| 129 | #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL | ||
| 130 | |||
| 131 | /* Guest_tsc -> host_tsc conversion requires 64-bit division. */ | ||
| 132 | static int __read_mostly cpu_preemption_timer_multi; | ||
| 133 | static bool __read_mostly enable_preemption_timer = 1; | ||
| 134 | #ifdef CONFIG_X86_64 | ||
| 135 | module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); | ||
| 136 | #endif | ||
| 137 | |||
| 138 | #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) | ||
| 139 | #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE | ||
| 140 | #define KVM_VM_CR0_ALWAYS_ON \ | ||
| 141 | (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \ | ||
| 142 | X86_CR0_WP | X86_CR0_PG | X86_CR0_PE) | ||
| 143 | #define KVM_CR4_GUEST_OWNED_BITS \ | ||
| 144 | (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ | ||
| 145 | | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD) | ||
| 146 | |||
| 147 | #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE | ||
| 148 | #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) | ||
| 149 | #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) | ||
| 150 | |||
| 151 | #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) | ||
| 152 | |||
| 153 | #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 | ||
| 154 | |||
| 155 | /* | ||
| 156 | * Hyper-V requires all of these, so mark them as supported even though | ||
| 157 | * they are just treated the same as all-context. | ||
| 158 | */ | ||
| 159 | #define VMX_VPID_EXTENT_SUPPORTED_MASK \ | ||
| 160 | (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ | ||
| 161 | VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ | ||
| 162 | VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ | ||
| 163 | VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) | ||
| 164 | |||
| 165 | /* | ||
| 166 | * These 2 parameters are used to config the controls for Pause-Loop Exiting: | ||
| 167 | * ple_gap: upper bound on the amount of time between two successive | ||
| 168 | * executions of PAUSE in a loop. Also indicate if ple enabled. | ||
| 169 | * According to test, this time is usually smaller than 128 cycles. | ||
| 170 | * ple_window: upper bound on the amount of time a guest is allowed to execute | ||
| 171 | * in a PAUSE loop. Tests indicate that most spinlocks are held for | ||
| 172 | * less than 2^12 cycles | ||
| 173 | * Time is measured based on a counter that runs at the same rate as the TSC, | ||
| 174 | * refer SDM volume 3b section 21.6.13 & 22.1.3. | ||
| 175 | */ | ||
| 176 | static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP; | ||
| 177 | module_param(ple_gap, uint, 0444); | ||
| 178 | |||
| 179 | static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; | ||
| 180 | module_param(ple_window, uint, 0444); | ||
| 181 | |||
| 182 | /* Default doubles per-vcpu window every exit. */ | ||
| 183 | static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW; | ||
| 184 | module_param(ple_window_grow, uint, 0444); | ||
| 185 | |||
| 186 | /* Default resets per-vcpu window every exit to ple_window. */ | ||
| 187 | static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK; | ||
| 188 | module_param(ple_window_shrink, uint, 0444); | ||
| 189 | |||
| 190 | /* Default is to compute the maximum so we can never overflow. */ | ||
| 191 | static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; | ||
| 192 | module_param(ple_window_max, uint, 0444); | ||
| 193 | |||
| 194 | extern const ulong vmx_return; | ||
| 195 | extern const ulong vmx_early_consistency_check_return; | ||
| 196 | |||
| 197 | static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); | ||
| 198 | static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); | ||
| 199 | static DEFINE_MUTEX(vmx_l1d_flush_mutex); | ||
| 200 | |||
| 201 | /* Storage for pre module init parameter parsing */ | ||
| 202 | static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO; | ||
| 203 | |||
| 204 | static const struct { | ||
| 205 | const char *option; | ||
| 206 | bool for_parse; | ||
| 207 | } vmentry_l1d_param[] = { | ||
| 208 | [VMENTER_L1D_FLUSH_AUTO] = {"auto", true}, | ||
| 209 | [VMENTER_L1D_FLUSH_NEVER] = {"never", true}, | ||
| 210 | [VMENTER_L1D_FLUSH_COND] = {"cond", true}, | ||
| 211 | [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true}, | ||
| 212 | [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false}, | ||
| 213 | [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false}, | ||
| 214 | }; | ||
| 215 | |||
| 216 | #define L1D_CACHE_ORDER 4 | ||
| 217 | static void *vmx_l1d_flush_pages; | ||
| 218 | |||
| 219 | static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) | ||
| 220 | { | ||
| 221 | struct page *page; | ||
| 222 | unsigned int i; | ||
| 223 | |||
| 224 | if (!enable_ept) { | ||
| 225 | l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; | ||
| 226 | return 0; | ||
| 227 | } | ||
| 228 | |||
| 229 | if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { | ||
| 230 | u64 msr; | ||
| 231 | |||
| 232 | rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); | ||
| 233 | if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { | ||
| 234 | l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; | ||
| 235 | return 0; | ||
| 236 | } | ||
| 237 | } | ||
| 238 | |||
| 239 | /* If set to auto use the default l1tf mitigation method */ | ||
| 240 | if (l1tf == VMENTER_L1D_FLUSH_AUTO) { | ||
| 241 | switch (l1tf_mitigation) { | ||
| 242 | case L1TF_MITIGATION_OFF: | ||
| 243 | l1tf = VMENTER_L1D_FLUSH_NEVER; | ||
| 244 | break; | ||
| 245 | case L1TF_MITIGATION_FLUSH_NOWARN: | ||
| 246 | case L1TF_MITIGATION_FLUSH: | ||
| 247 | case L1TF_MITIGATION_FLUSH_NOSMT: | ||
| 248 | l1tf = VMENTER_L1D_FLUSH_COND; | ||
| 249 | break; | ||
| 250 | case L1TF_MITIGATION_FULL: | ||
| 251 | case L1TF_MITIGATION_FULL_FORCE: | ||
| 252 | l1tf = VMENTER_L1D_FLUSH_ALWAYS; | ||
| 253 | break; | ||
| 254 | } | ||
| 255 | } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) { | ||
| 256 | l1tf = VMENTER_L1D_FLUSH_ALWAYS; | ||
| 257 | } | ||
| 258 | |||
| 259 | if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && | ||
| 260 | !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { | ||
| 261 | page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); | ||
| 262 | if (!page) | ||
| 263 | return -ENOMEM; | ||
| 264 | vmx_l1d_flush_pages = page_address(page); | ||
| 265 | |||
| 266 | /* | ||
| 267 | * Initialize each page with a different pattern in | ||
| 268 | * order to protect against KSM in the nested | ||
| 269 | * virtualization case. | ||
| 270 | */ | ||
| 271 | for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) { | ||
| 272 | memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1, | ||
| 273 | PAGE_SIZE); | ||
| 274 | } | ||
| 275 | } | ||
| 276 | |||
| 277 | l1tf_vmx_mitigation = l1tf; | ||
| 278 | |||
| 279 | if (l1tf != VMENTER_L1D_FLUSH_NEVER) | ||
| 280 | static_branch_enable(&vmx_l1d_should_flush); | ||
| 281 | else | ||
| 282 | static_branch_disable(&vmx_l1d_should_flush); | ||
| 283 | |||
| 284 | if (l1tf == VMENTER_L1D_FLUSH_COND) | ||
| 285 | static_branch_enable(&vmx_l1d_flush_cond); | ||
| 286 | else | ||
| 287 | static_branch_disable(&vmx_l1d_flush_cond); | ||
| 288 | return 0; | ||
| 289 | } | ||
| 290 | |||
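vmx_setup_l1d_flush() in the file being removed here allocates the L1D flush buffer and, as its own comment notes, fills each page with a different byte pattern so KSM cannot merge the pages in a nested setup. The same loop reduced to plain userspace C (PAGE_SZ and CACHE_ORDER are illustrative stand-ins):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define CACHE_ORDER     4
#define PAGE_SZ         4096

int main(void)
{
        unsigned int npages = 1u << CACHE_ORDER;
        unsigned char *buf = malloc((size_t)npages * PAGE_SZ);

        if (!buf)
                return 1;
        /* distinct byte pattern per page, so no two pages are identical */
        for (unsigned int i = 0; i < npages; i++)
                memset(buf + (size_t)i * PAGE_SZ, i + 1, PAGE_SZ);
        printf("first byte of page 3: %d\n", buf[3 * PAGE_SZ]);         /* 4 */
        free(buf);
        return 0;
}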
| 291 | static int vmentry_l1d_flush_parse(const char *s) | ||
| 292 | { | ||
| 293 | unsigned int i; | ||
| 294 | |||
| 295 | if (s) { | ||
| 296 | for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { | ||
| 297 | if (vmentry_l1d_param[i].for_parse && | ||
| 298 | sysfs_streq(s, vmentry_l1d_param[i].option)) | ||
| 299 | return i; | ||
| 300 | } | ||
| 301 | } | ||
| 302 | return -EINVAL; | ||
| 303 | } | ||
| 304 | |||
| 305 | static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) | ||
| 306 | { | ||
| 307 | int l1tf, ret; | ||
| 308 | |||
| 309 | l1tf = vmentry_l1d_flush_parse(s); | ||
| 310 | if (l1tf < 0) | ||
| 311 | return l1tf; | ||
| 312 | |||
| 313 | if (!boot_cpu_has(X86_BUG_L1TF)) | ||
| 314 | return 0; | ||
| 315 | |||
| 316 | /* | ||
| 317 | * Has vmx_init() run already? If not then this is the pre init | ||
| 318 | * parameter parsing. In that case just store the value and let | ||
| 319 | * vmx_init() do the proper setup after enable_ept has been | ||
| 320 | * established. | ||
| 321 | */ | ||
| 322 | if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) { | ||
| 323 | vmentry_l1d_flush_param = l1tf; | ||
| 324 | return 0; | ||
| 325 | } | ||
| 326 | |||
| 327 | mutex_lock(&vmx_l1d_flush_mutex); | ||
| 328 | ret = vmx_setup_l1d_flush(l1tf); | ||
| 329 | mutex_unlock(&vmx_l1d_flush_mutex); | ||
| 330 | return ret; | ||
| 331 | } | ||
| 332 | |||
| 333 | static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) | ||
| 334 | { | ||
| 335 | if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param))) | ||
| 336 | return sprintf(s, "???\n"); | ||
| 337 | |||
| 338 | return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); | ||
| 339 | } | ||
| 340 | |||
| 341 | static const struct kernel_param_ops vmentry_l1d_flush_ops = { | ||
| 342 | .set = vmentry_l1d_flush_set, | ||
| 343 | .get = vmentry_l1d_flush_get, | ||
| 344 | }; | ||
| 345 | module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); | ||
| 346 | |||
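The vmentry_l1d_flush parameter machinery above matches the string against a small option table and, when it runs before vmx_init(), only records the choice for later application. A self-contained model of that parse-and-defer flow; all names below are illustrative, and the kernel's extra checks (X86_BUG_L1TF and so on) are omitted:

#include <stdio.h>
#include <string.h>

enum flush_state { FLUSH_AUTO, FLUSH_NEVER, FLUSH_COND, FLUSH_ALWAYS, FLUSH_NSTATES };

static const struct {
        const char *option;
        int for_parse;
} flush_param[] = {
        [FLUSH_AUTO]    = { "auto",   1 },
        [FLUSH_NEVER]   = { "never",  1 },
        [FLUSH_COND]    = { "cond",   1 },
        [FLUSH_ALWAYS]  = { "always", 1 },
};

static enum flush_state pending = FLUSH_AUTO;   /* like vmentry_l1d_flush_param */
static int initialised;                         /* set once setup has run */

static int flush_parse(const char *s)
{
        for (int i = 0; i < FLUSH_NSTATES; i++)
                if (flush_param[i].for_parse && !strcmp(s, flush_param[i].option))
                        return i;
        return -1;                              /* -EINVAL in the kernel */
}

static int flush_set(const char *s)
{
        int v = flush_parse(s);

        if (v < 0)
                return v;
        if (!initialised) {                     /* pre-init: only record the choice */
                pending = v;
                printf("recorded mode %d for later\n", v);
                return 0;
        }
        printf("applying mitigation mode %d\n", v);
        return 0;
}

int main(void)
{
        flush_set("cond");      /* before init: stored in "pending" */
        initialised = 1;
        flush_set("always");    /* after init: applied immediately */
        return 0;
}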
| 347 | enum ept_pointers_status { | ||
| 348 | EPT_POINTERS_CHECK = 0, | ||
| 349 | EPT_POINTERS_MATCH = 1, | ||
| 350 | EPT_POINTERS_MISMATCH = 2 | ||
| 351 | }; | ||
| 352 | |||
| 353 | struct kvm_vmx { | ||
| 354 | struct kvm kvm; | ||
| 355 | |||
| 356 | unsigned int tss_addr; | ||
| 357 | bool ept_identity_pagetable_done; | ||
| 358 | gpa_t ept_identity_map_addr; | ||
| 359 | |||
| 360 | enum ept_pointers_status ept_pointers_match; | ||
| 361 | spinlock_t ept_pointer_lock; | ||
| 362 | }; | ||
| 363 | |||
| 364 | #define NR_AUTOLOAD_MSRS 8 | ||
| 365 | |||
| 366 | struct vmcs_hdr { | ||
| 367 | u32 revision_id:31; | ||
| 368 | u32 shadow_vmcs:1; | ||
| 369 | }; | ||
| 370 | |||
| 371 | struct vmcs { | ||
| 372 | struct vmcs_hdr hdr; | ||
| 373 | u32 abort; | ||
| 374 | char data[0]; | ||
| 375 | }; | ||
| 376 | |||
| 377 | /* | ||
| 378 | * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT | ||
| 379 | * and whose values change infrequently, but are not constant. I.e. this is | ||
| 380 | * used as a write-through cache of the corresponding VMCS fields. | ||
| 381 | */ | ||
| 382 | struct vmcs_host_state { | ||
| 383 | unsigned long cr3; /* May not match real cr3 */ | ||
| 384 | unsigned long cr4; /* May not match real cr4 */ | ||
| 385 | unsigned long gs_base; | ||
| 386 | unsigned long fs_base; | ||
| 387 | |||
| 388 | u16 fs_sel, gs_sel, ldt_sel; | ||
| 389 | #ifdef CONFIG_X86_64 | ||
| 390 | u16 ds_sel, es_sel; | ||
| 391 | #endif | ||
| 392 | }; | ||
| 393 | |||
| 394 | /* | ||
| 395 | * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also | ||
| 396 | * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs | ||
| 397 | * loaded on this CPU (so we can clear them if the CPU goes down). | ||
| 398 | */ | ||
| 399 | struct loaded_vmcs { | ||
| 400 | struct vmcs *vmcs; | ||
| 401 | struct vmcs *shadow_vmcs; | ||
| 402 | int cpu; | ||
| 403 | bool launched; | ||
| 404 | bool nmi_known_unmasked; | ||
| 405 | bool hv_timer_armed; | ||
| 406 | /* Support for vnmi-less CPUs */ | ||
| 407 | int soft_vnmi_blocked; | ||
| 408 | ktime_t entry_time; | ||
| 409 | s64 vnmi_blocked_time; | ||
| 410 | unsigned long *msr_bitmap; | ||
| 411 | struct list_head loaded_vmcss_on_cpu_link; | ||
| 412 | struct vmcs_host_state host_state; | ||
| 413 | }; | ||
| 414 | |||
| 415 | struct shared_msr_entry { | ||
| 416 | unsigned index; | ||
| 417 | u64 data; | ||
| 418 | u64 mask; | ||
| 419 | }; | ||
| 420 | |||
| 421 | /* | ||
| 422 | * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a | ||
| 423 | * single nested guest (L2), hence the name vmcs12. Any VMX implementation has | ||
| 424 | * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is | ||
| 425 | * stored in guest memory specified by VMPTRLD, but is opaque to the guest, | ||
| 426 | * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. | ||
| 427 | * More than one of these structures may exist, if L1 runs multiple L2 guests. | ||
| 428 | * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the | ||
| 429 | * underlying hardware which will be used to run L2. | ||
| 430 | * This structure is packed to ensure that its layout is identical across | ||
| 431 | * machines (necessary for live migration). | ||
| 432 | * | ||
| 433 | * IMPORTANT: Changing the layout of existing fields in this structure | ||
| 434 | * will break save/restore compatibility with older kvm releases. When | ||
| 435 | * adding new fields, either use space in the reserved padding* arrays | ||
| 436 | * or add the new fields to the end of the structure. | ||
| 437 | */ | ||
| 438 | typedef u64 natural_width; | ||
| 439 | struct __packed vmcs12 { | ||
| 440 | /* According to the Intel spec, a VMCS region must start with the | ||
| 441 | * following two fields. Then follow implementation-specific data. | ||
| 442 | */ | ||
| 443 | struct vmcs_hdr hdr; | ||
| 444 | u32 abort; | ||
| 445 | |||
| 446 | u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ | ||
| 447 | u32 padding[7]; /* room for future expansion */ | ||
| 448 | |||
| 449 | u64 io_bitmap_a; | ||
| 450 | u64 io_bitmap_b; | ||
| 451 | u64 msr_bitmap; | ||
| 452 | u64 vm_exit_msr_store_addr; | ||
| 453 | u64 vm_exit_msr_load_addr; | ||
| 454 | u64 vm_entry_msr_load_addr; | ||
| 455 | u64 tsc_offset; | ||
| 456 | u64 virtual_apic_page_addr; | ||
| 457 | u64 apic_access_addr; | ||
| 458 | u64 posted_intr_desc_addr; | ||
| 459 | u64 ept_pointer; | ||
| 460 | u64 eoi_exit_bitmap0; | ||
| 461 | u64 eoi_exit_bitmap1; | ||
| 462 | u64 eoi_exit_bitmap2; | ||
| 463 | u64 eoi_exit_bitmap3; | ||
| 464 | u64 xss_exit_bitmap; | ||
| 465 | u64 guest_physical_address; | ||
| 466 | u64 vmcs_link_pointer; | ||
| 467 | u64 guest_ia32_debugctl; | ||
| 468 | u64 guest_ia32_pat; | ||
| 469 | u64 guest_ia32_efer; | ||
| 470 | u64 guest_ia32_perf_global_ctrl; | ||
| 471 | u64 guest_pdptr0; | ||
| 472 | u64 guest_pdptr1; | ||
| 473 | u64 guest_pdptr2; | ||
| 474 | u64 guest_pdptr3; | ||
| 475 | u64 guest_bndcfgs; | ||
| 476 | u64 host_ia32_pat; | ||
| 477 | u64 host_ia32_efer; | ||
| 478 | u64 host_ia32_perf_global_ctrl; | ||
| 479 | u64 vmread_bitmap; | ||
| 480 | u64 vmwrite_bitmap; | ||
| 481 | u64 vm_function_control; | ||
| 482 | u64 eptp_list_address; | ||
| 483 | u64 pml_address; | ||
| 484 | u64 padding64[3]; /* room for future expansion */ | ||
| 485 | /* | ||
| 486 | * To allow migration of L1 (complete with its L2 guests) between | ||
| 487 | * machines of different natural widths (32 or 64 bit), we cannot have | ||
| 488 | * unsigned long fields with no explict size. We use u64 (aliased | ||
| 489 | * natural_width) instead. Luckily, x86 is little-endian. | ||
| 490 | */ | ||
| 491 | natural_width cr0_guest_host_mask; | ||
| 492 | natural_width cr4_guest_host_mask; | ||
| 493 | natural_width cr0_read_shadow; | ||
| 494 | natural_width cr4_read_shadow; | ||
| 495 | natural_width cr3_target_value0; | ||
| 496 | natural_width cr3_target_value1; | ||
| 497 | natural_width cr3_target_value2; | ||
| 498 | natural_width cr3_target_value3; | ||
| 499 | natural_width exit_qualification; | ||
| 500 | natural_width guest_linear_address; | ||
| 501 | natural_width guest_cr0; | ||
| 502 | natural_width guest_cr3; | ||
| 503 | natural_width guest_cr4; | ||
| 504 | natural_width guest_es_base; | ||
| 505 | natural_width guest_cs_base; | ||
| 506 | natural_width guest_ss_base; | ||
| 507 | natural_width guest_ds_base; | ||
| 508 | natural_width guest_fs_base; | ||
| 509 | natural_width guest_gs_base; | ||
| 510 | natural_width guest_ldtr_base; | ||
| 511 | natural_width guest_tr_base; | ||
| 512 | natural_width guest_gdtr_base; | ||
| 513 | natural_width guest_idtr_base; | ||
| 514 | natural_width guest_dr7; | ||
| 515 | natural_width guest_rsp; | ||
| 516 | natural_width guest_rip; | ||
| 517 | natural_width guest_rflags; | ||
| 518 | natural_width guest_pending_dbg_exceptions; | ||
| 519 | natural_width guest_sysenter_esp; | ||
| 520 | natural_width guest_sysenter_eip; | ||
| 521 | natural_width host_cr0; | ||
| 522 | natural_width host_cr3; | ||
| 523 | natural_width host_cr4; | ||
| 524 | natural_width host_fs_base; | ||
| 525 | natural_width host_gs_base; | ||
| 526 | natural_width host_tr_base; | ||
| 527 | natural_width host_gdtr_base; | ||
| 528 | natural_width host_idtr_base; | ||
| 529 | natural_width host_ia32_sysenter_esp; | ||
| 530 | natural_width host_ia32_sysenter_eip; | ||
| 531 | natural_width host_rsp; | ||
| 532 | natural_width host_rip; | ||
| 533 | natural_width paddingl[8]; /* room for future expansion */ | ||
| 534 | u32 pin_based_vm_exec_control; | ||
| 535 | u32 cpu_based_vm_exec_control; | ||
| 536 | u32 exception_bitmap; | ||
| 537 | u32 page_fault_error_code_mask; | ||
| 538 | u32 page_fault_error_code_match; | ||
| 539 | u32 cr3_target_count; | ||
| 540 | u32 vm_exit_controls; | ||
| 541 | u32 vm_exit_msr_store_count; | ||
| 542 | u32 vm_exit_msr_load_count; | ||
| 543 | u32 vm_entry_controls; | ||
| 544 | u32 vm_entry_msr_load_count; | ||
| 545 | u32 vm_entry_intr_info_field; | ||
| 546 | u32 vm_entry_exception_error_code; | ||
| 547 | u32 vm_entry_instruction_len; | ||
| 548 | u32 tpr_threshold; | ||
| 549 | u32 secondary_vm_exec_control; | ||
| 550 | u32 vm_instruction_error; | ||
| 551 | u32 vm_exit_reason; | ||
| 552 | u32 vm_exit_intr_info; | ||
| 553 | u32 vm_exit_intr_error_code; | ||
| 554 | u32 idt_vectoring_info_field; | ||
| 555 | u32 idt_vectoring_error_code; | ||
| 556 | u32 vm_exit_instruction_len; | ||
| 557 | u32 vmx_instruction_info; | ||
| 558 | u32 guest_es_limit; | ||
| 559 | u32 guest_cs_limit; | ||
| 560 | u32 guest_ss_limit; | ||
| 561 | u32 guest_ds_limit; | ||
| 562 | u32 guest_fs_limit; | ||
| 563 | u32 guest_gs_limit; | ||
| 564 | u32 guest_ldtr_limit; | ||
| 565 | u32 guest_tr_limit; | ||
| 566 | u32 guest_gdtr_limit; | ||
| 567 | u32 guest_idtr_limit; | ||
| 568 | u32 guest_es_ar_bytes; | ||
| 569 | u32 guest_cs_ar_bytes; | ||
| 570 | u32 guest_ss_ar_bytes; | ||
| 571 | u32 guest_ds_ar_bytes; | ||
| 572 | u32 guest_fs_ar_bytes; | ||
| 573 | u32 guest_gs_ar_bytes; | ||
| 574 | u32 guest_ldtr_ar_bytes; | ||
| 575 | u32 guest_tr_ar_bytes; | ||
| 576 | u32 guest_interruptibility_info; | ||
| 577 | u32 guest_activity_state; | ||
| 578 | u32 guest_sysenter_cs; | ||
| 579 | u32 host_ia32_sysenter_cs; | ||
| 580 | u32 vmx_preemption_timer_value; | ||
| 581 | u32 padding32[7]; /* room for future expansion */ | ||
| 582 | u16 virtual_processor_id; | ||
| 583 | u16 posted_intr_nv; | ||
| 584 | u16 guest_es_selector; | ||
| 585 | u16 guest_cs_selector; | ||
| 586 | u16 guest_ss_selector; | ||
| 587 | u16 guest_ds_selector; | ||
| 588 | u16 guest_fs_selector; | ||
| 589 | u16 guest_gs_selector; | ||
| 590 | u16 guest_ldtr_selector; | ||
| 591 | u16 guest_tr_selector; | ||
| 592 | u16 guest_intr_status; | ||
| 593 | u16 host_es_selector; | ||
| 594 | u16 host_cs_selector; | ||
| 595 | u16 host_ss_selector; | ||
| 596 | u16 host_ds_selector; | ||
| 597 | u16 host_fs_selector; | ||
| 598 | u16 host_gs_selector; | ||
| 599 | u16 host_tr_selector; | ||
| 600 | u16 guest_pml_index; | ||
| 601 | }; | ||
| 602 | |||
| 603 | /* | ||
| 604 | * For save/restore compatibility, the vmcs12 field offsets must not change. | ||
| 605 | */ | ||
| 606 | #define CHECK_OFFSET(field, loc) \ | ||
| 607 | BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc), \ | ||
| 608 | "Offset of " #field " in struct vmcs12 has changed.") | ||
| 609 | |||
| 610 | static inline void vmx_check_vmcs12_offsets(void) { | ||
| 611 | CHECK_OFFSET(hdr, 0); | ||
| 612 | CHECK_OFFSET(abort, 4); | ||
| 613 | CHECK_OFFSET(launch_state, 8); | ||
| 614 | CHECK_OFFSET(io_bitmap_a, 40); | ||
| 615 | CHECK_OFFSET(io_bitmap_b, 48); | ||
| 616 | CHECK_OFFSET(msr_bitmap, 56); | ||
| 617 | CHECK_OFFSET(vm_exit_msr_store_addr, 64); | ||
| 618 | CHECK_OFFSET(vm_exit_msr_load_addr, 72); | ||
| 619 | CHECK_OFFSET(vm_entry_msr_load_addr, 80); | ||
| 620 | CHECK_OFFSET(tsc_offset, 88); | ||
| 621 | CHECK_OFFSET(virtual_apic_page_addr, 96); | ||
| 622 | CHECK_OFFSET(apic_access_addr, 104); | ||
| 623 | CHECK_OFFSET(posted_intr_desc_addr, 112); | ||
| 624 | CHECK_OFFSET(ept_pointer, 120); | ||
| 625 | CHECK_OFFSET(eoi_exit_bitmap0, 128); | ||
| 626 | CHECK_OFFSET(eoi_exit_bitmap1, 136); | ||
| 627 | CHECK_OFFSET(eoi_exit_bitmap2, 144); | ||
| 628 | CHECK_OFFSET(eoi_exit_bitmap3, 152); | ||
| 629 | CHECK_OFFSET(xss_exit_bitmap, 160); | ||
| 630 | CHECK_OFFSET(guest_physical_address, 168); | ||
| 631 | CHECK_OFFSET(vmcs_link_pointer, 176); | ||
| 632 | CHECK_OFFSET(guest_ia32_debugctl, 184); | ||
| 633 | CHECK_OFFSET(guest_ia32_pat, 192); | ||
| 634 | CHECK_OFFSET(guest_ia32_efer, 200); | ||
| 635 | CHECK_OFFSET(guest_ia32_perf_global_ctrl, 208); | ||
| 636 | CHECK_OFFSET(guest_pdptr0, 216); | ||
| 637 | CHECK_OFFSET(guest_pdptr1, 224); | ||
| 638 | CHECK_OFFSET(guest_pdptr2, 232); | ||
| 639 | CHECK_OFFSET(guest_pdptr3, 240); | ||
| 640 | CHECK_OFFSET(guest_bndcfgs, 248); | ||
| 641 | CHECK_OFFSET(host_ia32_pat, 256); | ||
| 642 | CHECK_OFFSET(host_ia32_efer, 264); | ||
| 643 | CHECK_OFFSET(host_ia32_perf_global_ctrl, 272); | ||
| 644 | CHECK_OFFSET(vmread_bitmap, 280); | ||
| 645 | CHECK_OFFSET(vmwrite_bitmap, 288); | ||
| 646 | CHECK_OFFSET(vm_function_control, 296); | ||
| 647 | CHECK_OFFSET(eptp_list_address, 304); | ||
| 648 | CHECK_OFFSET(pml_address, 312); | ||
| 649 | CHECK_OFFSET(cr0_guest_host_mask, 344); | ||
| 650 | CHECK_OFFSET(cr4_guest_host_mask, 352); | ||
| 651 | CHECK_OFFSET(cr0_read_shadow, 360); | ||
| 652 | CHECK_OFFSET(cr4_read_shadow, 368); | ||
| 653 | CHECK_OFFSET(cr3_target_value0, 376); | ||
| 654 | CHECK_OFFSET(cr3_target_value1, 384); | ||
| 655 | CHECK_OFFSET(cr3_target_value2, 392); | ||
| 656 | CHECK_OFFSET(cr3_target_value3, 400); | ||
| 657 | CHECK_OFFSET(exit_qualification, 408); | ||
| 658 | CHECK_OFFSET(guest_linear_address, 416); | ||
| 659 | CHECK_OFFSET(guest_cr0, 424); | ||
| 660 | CHECK_OFFSET(guest_cr3, 432); | ||
| 661 | CHECK_OFFSET(guest_cr4, 440); | ||
| 662 | CHECK_OFFSET(guest_es_base, 448); | ||
| 663 | CHECK_OFFSET(guest_cs_base, 456); | ||
| 664 | CHECK_OFFSET(guest_ss_base, 464); | ||
| 665 | CHECK_OFFSET(guest_ds_base, 472); | ||
| 666 | CHECK_OFFSET(guest_fs_base, 480); | ||
| 667 | CHECK_OFFSET(guest_gs_base, 488); | ||
| 668 | CHECK_OFFSET(guest_ldtr_base, 496); | ||
| 669 | CHECK_OFFSET(guest_tr_base, 504); | ||
| 670 | CHECK_OFFSET(guest_gdtr_base, 512); | ||
| 671 | CHECK_OFFSET(guest_idtr_base, 520); | ||
| 672 | CHECK_OFFSET(guest_dr7, 528); | ||
| 673 | CHECK_OFFSET(guest_rsp, 536); | ||
| 674 | CHECK_OFFSET(guest_rip, 544); | ||
| 675 | CHECK_OFFSET(guest_rflags, 552); | ||
| 676 | CHECK_OFFSET(guest_pending_dbg_exceptions, 560); | ||
| 677 | CHECK_OFFSET(guest_sysenter_esp, 568); | ||
| 678 | CHECK_OFFSET(guest_sysenter_eip, 576); | ||
| 679 | CHECK_OFFSET(host_cr0, 584); | ||
| 680 | CHECK_OFFSET(host_cr3, 592); | ||
| 681 | CHECK_OFFSET(host_cr4, 600); | ||
| 682 | CHECK_OFFSET(host_fs_base, 608); | ||
| 683 | CHECK_OFFSET(host_gs_base, 616); | ||
| 684 | CHECK_OFFSET(host_tr_base, 624); | ||
| 685 | CHECK_OFFSET(host_gdtr_base, 632); | ||
| 686 | CHECK_OFFSET(host_idtr_base, 640); | ||
| 687 | CHECK_OFFSET(host_ia32_sysenter_esp, 648); | ||
| 688 | CHECK_OFFSET(host_ia32_sysenter_eip, 656); | ||
| 689 | CHECK_OFFSET(host_rsp, 664); | ||
| 690 | CHECK_OFFSET(host_rip, 672); | ||
| 691 | CHECK_OFFSET(pin_based_vm_exec_control, 744); | ||
| 692 | CHECK_OFFSET(cpu_based_vm_exec_control, 748); | ||
| 693 | CHECK_OFFSET(exception_bitmap, 752); | ||
| 694 | CHECK_OFFSET(page_fault_error_code_mask, 756); | ||
| 695 | CHECK_OFFSET(page_fault_error_code_match, 760); | ||
| 696 | CHECK_OFFSET(cr3_target_count, 764); | ||
| 697 | CHECK_OFFSET(vm_exit_controls, 768); | ||
| 698 | CHECK_OFFSET(vm_exit_msr_store_count, 772); | ||
| 699 | CHECK_OFFSET(vm_exit_msr_load_count, 776); | ||
| 700 | CHECK_OFFSET(vm_entry_controls, 780); | ||
| 701 | CHECK_OFFSET(vm_entry_msr_load_count, 784); | ||
| 702 | CHECK_OFFSET(vm_entry_intr_info_field, 788); | ||
| 703 | CHECK_OFFSET(vm_entry_exception_error_code, 792); | ||
| 704 | CHECK_OFFSET(vm_entry_instruction_len, 796); | ||
| 705 | CHECK_OFFSET(tpr_threshold, 800); | ||
| 706 | CHECK_OFFSET(secondary_vm_exec_control, 804); | ||
| 707 | CHECK_OFFSET(vm_instruction_error, 808); | ||
| 708 | CHECK_OFFSET(vm_exit_reason, 812); | ||
| 709 | CHECK_OFFSET(vm_exit_intr_info, 816); | ||
| 710 | CHECK_OFFSET(vm_exit_intr_error_code, 820); | ||
| 711 | CHECK_OFFSET(idt_vectoring_info_field, 824); | ||
| 712 | CHECK_OFFSET(idt_vectoring_error_code, 828); | ||
| 713 | CHECK_OFFSET(vm_exit_instruction_len, 832); | ||
| 714 | CHECK_OFFSET(vmx_instruction_info, 836); | ||
| 715 | CHECK_OFFSET(guest_es_limit, 840); | ||
| 716 | CHECK_OFFSET(guest_cs_limit, 844); | ||
| 717 | CHECK_OFFSET(guest_ss_limit, 848); | ||
| 718 | CHECK_OFFSET(guest_ds_limit, 852); | ||
| 719 | CHECK_OFFSET(guest_fs_limit, 856); | ||
| 720 | CHECK_OFFSET(guest_gs_limit, 860); | ||
| 721 | CHECK_OFFSET(guest_ldtr_limit, 864); | ||
| 722 | CHECK_OFFSET(guest_tr_limit, 868); | ||
| 723 | CHECK_OFFSET(guest_gdtr_limit, 872); | ||
| 724 | CHECK_OFFSET(guest_idtr_limit, 876); | ||
| 725 | CHECK_OFFSET(guest_es_ar_bytes, 880); | ||
| 726 | CHECK_OFFSET(guest_cs_ar_bytes, 884); | ||
| 727 | CHECK_OFFSET(guest_ss_ar_bytes, 888); | ||
| 728 | CHECK_OFFSET(guest_ds_ar_bytes, 892); | ||
| 729 | CHECK_OFFSET(guest_fs_ar_bytes, 896); | ||
| 730 | CHECK_OFFSET(guest_gs_ar_bytes, 900); | ||
| 731 | CHECK_OFFSET(guest_ldtr_ar_bytes, 904); | ||
| 732 | CHECK_OFFSET(guest_tr_ar_bytes, 908); | ||
| 733 | CHECK_OFFSET(guest_interruptibility_info, 912); | ||
| 734 | CHECK_OFFSET(guest_activity_state, 916); | ||
| 735 | CHECK_OFFSET(guest_sysenter_cs, 920); | ||
| 736 | CHECK_OFFSET(host_ia32_sysenter_cs, 924); | ||
| 737 | CHECK_OFFSET(vmx_preemption_timer_value, 928); | ||
| 738 | CHECK_OFFSET(virtual_processor_id, 960); | ||
| 739 | CHECK_OFFSET(posted_intr_nv, 962); | ||
| 740 | CHECK_OFFSET(guest_es_selector, 964); | ||
| 741 | CHECK_OFFSET(guest_cs_selector, 966); | ||
| 742 | CHECK_OFFSET(guest_ss_selector, 968); | ||
| 743 | CHECK_OFFSET(guest_ds_selector, 970); | ||
| 744 | CHECK_OFFSET(guest_fs_selector, 972); | ||
| 745 | CHECK_OFFSET(guest_gs_selector, 974); | ||
| 746 | CHECK_OFFSET(guest_ldtr_selector, 976); | ||
| 747 | CHECK_OFFSET(guest_tr_selector, 978); | ||
| 748 | CHECK_OFFSET(guest_intr_status, 980); | ||
| 749 | CHECK_OFFSET(host_es_selector, 982); | ||
| 750 | CHECK_OFFSET(host_cs_selector, 984); | ||
| 751 | CHECK_OFFSET(host_ss_selector, 986); | ||
| 752 | CHECK_OFFSET(host_ds_selector, 988); | ||
| 753 | CHECK_OFFSET(host_fs_selector, 990); | ||
| 754 | CHECK_OFFSET(host_gs_selector, 992); | ||
| 755 | CHECK_OFFSET(host_tr_selector, 994); | ||
| 756 | CHECK_OFFSET(guest_pml_index, 996); | ||
| 757 | } | ||
| 758 | |||
| 759 | /* | ||
| 760 | * VMCS12_REVISION is an arbitrary id that should be changed if the content or | ||
| 761 | * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and | ||
| 762 | * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. | ||
| 763 | * | ||
| 764 | * IMPORTANT: Changing this value will break save/restore compatibility with | ||
| 765 | * older kvm releases. | ||
| 766 | */ | ||
| 767 | #define VMCS12_REVISION 0x11e57ed0 | ||
| 768 | |||
| 769 | /* | ||
| 770 | * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region | ||
| 771 | * and any VMCS region. Although only sizeof(struct vmcs12) is used by the | ||
| 772 | * current implementation, 4K are reserved to avoid future complications. | ||
| 773 | */ | ||
| 774 | #define VMCS12_SIZE 0x1000 | ||
| 775 | |||
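As context for the two constants above, the sketch below shows what an L1 hypervisor is expected to do with them: allocate a VMCS12_SIZE (4 KiB) region and stamp its first 32 bits with the revision id reported by MSR_IA32_VMX_BASIC, which under nested KVM is VMCS12_REVISION. This is an illustrative sketch, not part of vmx.c; rdmsrl() is the standard kernel MSR-read helper and the buffer handling is simplified.

/* Illustrative sketch (not part of this file): tagging a VMCS/VMXON region
 * with the revision id advertised in MSR_IA32_VMX_BASIC before VMPTRLD. */
static void example_init_vmcs_region(void *vmcs_region /* VMCS12_SIZE bytes */)
{
        u64 basic;

        rdmsrl(MSR_IA32_VMX_BASIC, basic);
        /* Bits 30:0 of IA32_VMX_BASIC hold the VMCS revision identifier. */
        *(u32 *)vmcs_region = (u32)(basic & 0x7fffffff);
}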
| 776 | /* | ||
| 777 | * VMCS12_MAX_FIELD_INDEX is the highest index value used in any | ||
| 778 | * supported VMCS12 field encoding. | ||
| 779 | */ | ||
| 780 | #define VMCS12_MAX_FIELD_INDEX 0x17 | ||
| 781 | |||
| 782 | struct nested_vmx_msrs { | ||
| 783 | /* | ||
| 784 | * We only store the "true" versions of the VMX capability MSRs. We | ||
| 785 | * generate the "non-true" versions by setting the must-be-1 bits | ||
| 786 | * according to the SDM. | ||
| 787 | */ | ||
| 788 | u32 procbased_ctls_low; | ||
| 789 | u32 procbased_ctls_high; | ||
| 790 | u32 secondary_ctls_low; | ||
| 791 | u32 secondary_ctls_high; | ||
| 792 | u32 pinbased_ctls_low; | ||
| 793 | u32 pinbased_ctls_high; | ||
| 794 | u32 exit_ctls_low; | ||
| 795 | u32 exit_ctls_high; | ||
| 796 | u32 entry_ctls_low; | ||
| 797 | u32 entry_ctls_high; | ||
| 798 | u32 misc_low; | ||
| 799 | u32 misc_high; | ||
| 800 | u32 ept_caps; | ||
| 801 | u32 vpid_caps; | ||
| 802 | u64 basic; | ||
| 803 | u64 cr0_fixed0; | ||
| 804 | u64 cr0_fixed1; | ||
| 805 | u64 cr4_fixed0; | ||
| 806 | u64 cr4_fixed1; | ||
| 807 | u64 vmcs_enum; | ||
| 808 | u64 vmfunc_controls; | ||
| 809 | }; | ||
| 810 | |||
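The comment at the top of nested_vmx_msrs is easiest to see with a concrete example. The sketch below is not the code KVM runs here; it is a simplified illustration of deriving the "non-true" MSR_IA32_VMX_PROCBASED_CTLS value from the stored "true" version, assuming the existing CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR constant for the SDM's must-be-1 bits.

/* Illustrative sketch: building the "non-true" procbased-controls MSR value.
 * Low 32 bits = allowed-0 settings (with the must-be-1 bits forced on),
 * high 32 bits = allowed-1 settings. */
static u64 example_nontrue_procbased_ctls(const struct nested_vmx_msrs *msrs)
{
        u32 low  = msrs->procbased_ctls_low | CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
        u32 high = msrs->procbased_ctls_high;

        return low | ((u64)high << 32);
}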
| 811 | /* | ||
| 812 | * The nested_vmx structure is part of vcpu_vmx, and holds information we need | ||
| 813 | * for correct emulation of VMX (i.e., nested VMX) on this vcpu. | ||
| 814 | */ | ||
| 815 | struct nested_vmx { | ||
| 816 | /* Has the level1 guest done vmxon? */ | ||
| 817 | bool vmxon; | ||
| 818 | gpa_t vmxon_ptr; | ||
| 819 | bool pml_full; | ||
| 820 | |||
| 821 | /* The guest-physical address of the current VMCS L1 keeps for L2 */ | ||
| 822 | gpa_t current_vmptr; | ||
| 823 | /* | ||
| 824 | * Cache of the guest's VMCS, existing outside of guest memory. | ||
| 825 | * Loaded from guest memory during VMPTRLD. Flushed to guest | ||
| 826 | * memory during VMCLEAR and VMPTRLD. | ||
| 827 | */ | ||
| 828 | struct vmcs12 *cached_vmcs12; | ||
| 829 | /* | ||
| 830 | * Cache of the guest's shadow VMCS, existing outside of guest | ||
| 831 | * memory. Loaded from guest memory during VM entry. Flushed | ||
| 832 | * to guest memory during VM exit. | ||
| 833 | */ | ||
| 834 | struct vmcs12 *cached_shadow_vmcs12; | ||
| 835 | /* | ||
| 836 | * Indicates if the shadow vmcs or enlightened vmcs must be updated | ||
| 837 | * with the data held by struct vmcs12. | ||
| 838 | */ | ||
| 839 | bool need_vmcs12_sync; | ||
| 840 | bool dirty_vmcs12; | ||
| 841 | |||
| 842 | /* | ||
| 843 | * vmcs02 has been initialized, i.e. state that is constant for | ||
| 844 | * vmcs02 has been written to the backing VMCS. Initialization | ||
| 845 | * is delayed until L1 actually attempts to run a nested VM. | ||
| 846 | */ | ||
| 847 | bool vmcs02_initialized; | ||
| 848 | |||
| 849 | bool change_vmcs01_virtual_apic_mode; | ||
| 850 | |||
| 851 | /* | ||
| 852 | * Enlightened VMCS has been enabled. It does not mean that L1 has to | ||
| 853 | * use it. However, VMX features available to L1 will be limited based | ||
| 854 | * on what the enlightened VMCS supports. | ||
| 855 | */ | ||
| 856 | bool enlightened_vmcs_enabled; | ||
| 857 | |||
| 858 | /* L2 must run next, and mustn't decide to exit to L1. */ | ||
| 859 | bool nested_run_pending; | ||
| 860 | |||
| 861 | struct loaded_vmcs vmcs02; | ||
| 862 | |||
| 863 | /* | ||
| 864 | * Guest pages referred to in the vmcs02 with host-physical | ||
| 865 | * pointers, so we must keep them pinned while L2 runs. | ||
| 866 | */ | ||
| 867 | struct page *apic_access_page; | ||
| 868 | struct page *virtual_apic_page; | ||
| 869 | struct page *pi_desc_page; | ||
| 870 | struct pi_desc *pi_desc; | ||
| 871 | bool pi_pending; | ||
| 872 | u16 posted_intr_nv; | ||
| 873 | |||
| 874 | struct hrtimer preemption_timer; | ||
| 875 | bool preemption_timer_expired; | ||
| 876 | |||
| 877 | /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ | ||
| 878 | u64 vmcs01_debugctl; | ||
| 879 | u64 vmcs01_guest_bndcfgs; | ||
| 880 | |||
| 881 | u16 vpid02; | ||
| 882 | u16 last_vpid; | ||
| 883 | |||
| 884 | struct nested_vmx_msrs msrs; | ||
| 885 | |||
| 886 | /* SMM related state */ | ||
| 887 | struct { | ||
| 888 | /* in VMX operation on SMM entry? */ | ||
| 889 | bool vmxon; | ||
| 890 | /* in guest mode on SMM entry? */ | ||
| 891 | bool guest_mode; | ||
| 892 | } smm; | ||
| 893 | |||
| 894 | gpa_t hv_evmcs_vmptr; | ||
| 895 | struct page *hv_evmcs_page; | ||
| 896 | struct hv_enlightened_vmcs *hv_evmcs; | ||
| 897 | }; | ||
| 898 | |||
| 899 | #define POSTED_INTR_ON 0 | ||
| 900 | #define POSTED_INTR_SN 1 | ||
| 901 | |||
| 902 | /* Posted-Interrupt Descriptor */ | ||
| 903 | struct pi_desc { | ||
| 904 | u32 pir[8]; /* Posted interrupt requested */ | ||
| 905 | union { | ||
| 906 | struct { | ||
| 907 | /* bit 256 - Outstanding Notification */ | ||
| 908 | u16 on : 1, | ||
| 909 | /* bit 257 - Suppress Notification */ | ||
| 910 | sn : 1, | ||
| 911 | /* bit 271:258 - Reserved */ | ||
| 912 | rsvd_1 : 14; | ||
| 913 | /* bit 279:272 - Notification Vector */ | ||
| 914 | u8 nv; | ||
| 915 | /* bit 287:280 - Reserved */ | ||
| 916 | u8 rsvd_2; | ||
| 917 | /* bit 319:288 - Notification Destination */ | ||
| 918 | u32 ndst; | ||
| 919 | }; | ||
| 920 | u64 control; | ||
| 921 | }; | ||
| 922 | u32 rsvd[6]; | ||
| 923 | } __aligned(64); | ||
| 924 | |||
| 925 | static bool pi_test_and_set_on(struct pi_desc *pi_desc) | ||
| 926 | { | ||
| 927 | return test_and_set_bit(POSTED_INTR_ON, | ||
| 928 | (unsigned long *)&pi_desc->control); | ||
| 929 | } | ||
| 930 | |||
| 931 | static bool pi_test_and_clear_on(struct pi_desc *pi_desc) | ||
| 932 | { | ||
| 933 | return test_and_clear_bit(POSTED_INTR_ON, | ||
| 934 | (unsigned long *)&pi_desc->control); | ||
| 935 | } | ||
| 936 | |||
| 937 | static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) | ||
| 938 | { | ||
| 939 | return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); | ||
| 940 | } | ||
| 941 | |||
| 942 | static inline void pi_clear_sn(struct pi_desc *pi_desc) | ||
| 943 | { | ||
| 944 | return clear_bit(POSTED_INTR_SN, | ||
| 945 | (unsigned long *)&pi_desc->control); | ||
| 946 | } | ||
| 947 | |||
| 948 | static inline void pi_set_sn(struct pi_desc *pi_desc) | ||
| 949 | { | ||
| 950 | return set_bit(POSTED_INTR_SN, | ||
| 951 | (unsigned long *)&pi_desc->control); | ||
| 952 | } | ||
| 953 | |||
| 954 | static inline void pi_clear_on(struct pi_desc *pi_desc) | ||
| 955 | { | ||
| 956 | clear_bit(POSTED_INTR_ON, | ||
| 957 | (unsigned long *)&pi_desc->control); | ||
| 958 | } | ||
| 959 | |||
| 960 | static inline int pi_test_on(struct pi_desc *pi_desc) | ||
| 961 | { | ||
| 962 | return test_bit(POSTED_INTR_ON, | ||
| 963 | (unsigned long *)&pi_desc->control); | ||
| 964 | } | ||
| 965 | |||
| 966 | static inline int pi_test_sn(struct pi_desc *pi_desc) | ||
| 967 | { | ||
| 968 | return test_bit(POSTED_INTR_SN, | ||
| 969 | (unsigned long *)&pi_desc->control); | ||
| 970 | } | ||
| 971 | |||
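The helpers above are used by the interrupt-delivery path roughly as sketched below (simplified, assumed flow; the real code also sends the notification IPI or kicks the vCPU, which is omitted here): record the vector in the PIR first, then set the Outstanding Notification bit, and only notify if it was not already set.

/* Illustrative sketch of posting a vector into the descriptor. */
static void example_post_interrupt(struct pi_desc *pi_desc, int vector)
{
        if (pi_test_and_set_pir(vector, pi_desc))
                return;                 /* vector was already pending */

        if (pi_test_and_set_on(pi_desc))
                return;                 /* a notification is already outstanding */

        /* The caller would now send the notification IPI / kick the vCPU. */
}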
| 972 | struct vmx_msrs { | ||
| 973 | unsigned int nr; | ||
| 974 | struct vmx_msr_entry val[NR_AUTOLOAD_MSRS]; | ||
| 975 | }; | ||
| 976 | |||
| 977 | struct vcpu_vmx { | ||
| 978 | struct kvm_vcpu vcpu; | ||
| 979 | unsigned long host_rsp; | ||
| 980 | u8 fail; | ||
| 981 | u8 msr_bitmap_mode; | ||
| 982 | u32 exit_intr_info; | ||
| 983 | u32 idt_vectoring_info; | ||
| 984 | ulong rflags; | ||
| 985 | struct shared_msr_entry *guest_msrs; | ||
| 986 | int nmsrs; | ||
| 987 | int save_nmsrs; | ||
| 988 | bool guest_msrs_dirty; | ||
| 989 | unsigned long host_idt_base; | ||
| 990 | #ifdef CONFIG_X86_64 | ||
| 991 | u64 msr_host_kernel_gs_base; | ||
| 992 | u64 msr_guest_kernel_gs_base; | ||
| 993 | #endif | ||
| 994 | |||
| 995 | u64 arch_capabilities; | ||
| 996 | u64 spec_ctrl; | ||
| 997 | |||
| 998 | u32 vm_entry_controls_shadow; | ||
| 999 | u32 vm_exit_controls_shadow; | ||
| 1000 | u32 secondary_exec_control; | ||
| 1001 | |||
| 1002 | /* | ||
| 1003 | * loaded_vmcs points to the VMCS currently used in this vcpu. For a | ||
| 1004 | * non-nested (L1) guest, it always points to vmcs01. For a nested | ||
| 1005 | * guest (L2), it points to a different VMCS. loaded_cpu_state points | ||
| 1006 | * to the VMCS whose state is loaded into the CPU registers that only | ||
| 1007 | * need to be switched when transitioning to/from the kernel; a NULL | ||
| 1008 | * value indicates that host state is loaded. | ||
| 1009 | */ | ||
| 1010 | struct loaded_vmcs vmcs01; | ||
| 1011 | struct loaded_vmcs *loaded_vmcs; | ||
| 1012 | struct loaded_vmcs *loaded_cpu_state; | ||
| 1013 | bool __launched; /* temporary, used in vmx_vcpu_run */ | ||
| 1014 | struct msr_autoload { | ||
| 1015 | struct vmx_msrs guest; | ||
| 1016 | struct vmx_msrs host; | ||
| 1017 | } msr_autoload; | ||
| 1018 | |||
| 1019 | struct { | ||
| 1020 | int vm86_active; | ||
| 1021 | ulong save_rflags; | ||
| 1022 | struct kvm_segment segs[8]; | ||
| 1023 | } rmode; | ||
| 1024 | struct { | ||
| 1025 | u32 bitmask; /* 4 bits per segment (1 bit per field) */ | ||
| 1026 | struct kvm_save_segment { | ||
| 1027 | u16 selector; | ||
| 1028 | unsigned long base; | ||
| 1029 | u32 limit; | ||
| 1030 | u32 ar; | ||
| 1031 | } seg[8]; | ||
| 1032 | } segment_cache; | ||
| 1033 | int vpid; | ||
| 1034 | bool emulation_required; | ||
| 1035 | |||
| 1036 | u32 exit_reason; | ||
| 1037 | |||
| 1038 | /* Posted interrupt descriptor */ | ||
| 1039 | struct pi_desc pi_desc; | ||
| 1040 | |||
| 1041 | /* Support for a guest hypervisor (nested VMX) */ | ||
| 1042 | struct nested_vmx nested; | ||
| 1043 | |||
| 1044 | /* Dynamic PLE window. */ | ||
| 1045 | int ple_window; | ||
| 1046 | bool ple_window_dirty; | ||
| 1047 | |||
| 1048 | bool req_immediate_exit; | ||
| 1049 | |||
| 1050 | /* Support for PML */ | ||
| 1051 | #define PML_ENTITY_NUM 512 | ||
| 1052 | struct page *pml_pg; | ||
| 1053 | |||
| 1054 | /* apic deadline value in host tsc */ | ||
| 1055 | u64 hv_deadline_tsc; | ||
| 1056 | |||
| 1057 | u64 current_tsc_ratio; | ||
| 1058 | |||
| 1059 | u32 host_pkru; | ||
| 1060 | |||
| 1061 | unsigned long host_debugctlmsr; | ||
| 1062 | |||
| 1063 | /* | ||
| 1064 | * Only bits masked by msr_ia32_feature_control_valid_bits can be set in | ||
| 1065 | * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included | ||
| 1066 | * in msr_ia32_feature_control_valid_bits. | ||
| 1067 | */ | ||
| 1068 | u64 msr_ia32_feature_control; | ||
| 1069 | u64 msr_ia32_feature_control_valid_bits; | ||
| 1070 | u64 ept_pointer; | ||
| 1071 | }; | ||
| 1072 | |||
| 1073 | enum segment_cache_field { | ||
| 1074 | SEG_FIELD_SEL = 0, | ||
| 1075 | SEG_FIELD_BASE = 1, | ||
| 1076 | SEG_FIELD_LIMIT = 2, | ||
| 1077 | SEG_FIELD_AR = 3, | ||
| 1078 | |||
| 1079 | SEG_FIELD_NR = 4 | ||
| 1080 | }; | ||
| 1081 | |||
| 1082 | static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) | ||
| 1083 | { | ||
| 1084 | return container_of(kvm, struct kvm_vmx, kvm); | ||
| 1085 | } | ||
| 1086 | |||
| 1087 | static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) | ||
| 1088 | { | ||
| 1089 | return container_of(vcpu, struct vcpu_vmx, vcpu); | ||
| 1090 | } | ||
| 1091 | |||
| 1092 | static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) | ||
| 1093 | { | ||
| 1094 | return &(to_vmx(vcpu)->pi_desc); | ||
| 1095 | } | ||
| 1096 | |||
| 1097 | #define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) | ||
| 1098 | #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) | ||
| 1099 | #define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name) | ||
| 1100 | #define FIELD64(number, name) \ | ||
| 1101 | FIELD(number, name), \ | ||
| 1102 | [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32) | ||
| 1103 | |||
| 1104 | |||
| 1105 | static u16 shadow_read_only_fields[] = { | ||
| 1106 | #define SHADOW_FIELD_RO(x) x, | ||
| 1107 | #include "vmx_shadow_fields.h" | ||
| 1108 | }; | ||
| 1109 | static int max_shadow_read_only_fields = | ||
| 1110 | ARRAY_SIZE(shadow_read_only_fields); | ||
| 1111 | |||
| 1112 | static u16 shadow_read_write_fields[] = { | ||
| 1113 | #define SHADOW_FIELD_RW(x) x, | ||
| 1114 | #include "vmx_shadow_fields.h" | ||
| 1115 | }; | ||
| 1116 | static int max_shadow_read_write_fields = | ||
| 1117 | ARRAY_SIZE(shadow_read_write_fields); | ||
| 1118 | |||
| 1119 | static const unsigned short vmcs_field_to_offset_table[] = { | ||
| 1120 | FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), | ||
| 1121 | FIELD(POSTED_INTR_NV, posted_intr_nv), | ||
| 1122 | FIELD(GUEST_ES_SELECTOR, guest_es_selector), | ||
| 1123 | FIELD(GUEST_CS_SELECTOR, guest_cs_selector), | ||
| 1124 | FIELD(GUEST_SS_SELECTOR, guest_ss_selector), | ||
| 1125 | FIELD(GUEST_DS_SELECTOR, guest_ds_selector), | ||
| 1126 | FIELD(GUEST_FS_SELECTOR, guest_fs_selector), | ||
| 1127 | FIELD(GUEST_GS_SELECTOR, guest_gs_selector), | ||
| 1128 | FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), | ||
| 1129 | FIELD(GUEST_TR_SELECTOR, guest_tr_selector), | ||
| 1130 | FIELD(GUEST_INTR_STATUS, guest_intr_status), | ||
| 1131 | FIELD(GUEST_PML_INDEX, guest_pml_index), | ||
| 1132 | FIELD(HOST_ES_SELECTOR, host_es_selector), | ||
| 1133 | FIELD(HOST_CS_SELECTOR, host_cs_selector), | ||
| 1134 | FIELD(HOST_SS_SELECTOR, host_ss_selector), | ||
| 1135 | FIELD(HOST_DS_SELECTOR, host_ds_selector), | ||
| 1136 | FIELD(HOST_FS_SELECTOR, host_fs_selector), | ||
| 1137 | FIELD(HOST_GS_SELECTOR, host_gs_selector), | ||
| 1138 | FIELD(HOST_TR_SELECTOR, host_tr_selector), | ||
| 1139 | FIELD64(IO_BITMAP_A, io_bitmap_a), | ||
| 1140 | FIELD64(IO_BITMAP_B, io_bitmap_b), | ||
| 1141 | FIELD64(MSR_BITMAP, msr_bitmap), | ||
| 1142 | FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr), | ||
| 1143 | FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr), | ||
| 1144 | FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr), | ||
| 1145 | FIELD64(PML_ADDRESS, pml_address), | ||
| 1146 | FIELD64(TSC_OFFSET, tsc_offset), | ||
| 1147 | FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), | ||
| 1148 | FIELD64(APIC_ACCESS_ADDR, apic_access_addr), | ||
| 1149 | FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr), | ||
| 1150 | FIELD64(VM_FUNCTION_CONTROL, vm_function_control), | ||
| 1151 | FIELD64(EPT_POINTER, ept_pointer), | ||
| 1152 | FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0), | ||
| 1153 | FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1), | ||
| 1154 | FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2), | ||
| 1155 | FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3), | ||
| 1156 | FIELD64(EPTP_LIST_ADDRESS, eptp_list_address), | ||
| 1157 | FIELD64(VMREAD_BITMAP, vmread_bitmap), | ||
| 1158 | FIELD64(VMWRITE_BITMAP, vmwrite_bitmap), | ||
| 1159 | FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), | ||
| 1160 | FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), | ||
| 1161 | FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), | ||
| 1162 | FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), | ||
| 1163 | FIELD64(GUEST_IA32_PAT, guest_ia32_pat), | ||
| 1164 | FIELD64(GUEST_IA32_EFER, guest_ia32_efer), | ||
| 1165 | FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl), | ||
| 1166 | FIELD64(GUEST_PDPTR0, guest_pdptr0), | ||
| 1167 | FIELD64(GUEST_PDPTR1, guest_pdptr1), | ||
| 1168 | FIELD64(GUEST_PDPTR2, guest_pdptr2), | ||
| 1169 | FIELD64(GUEST_PDPTR3, guest_pdptr3), | ||
| 1170 | FIELD64(GUEST_BNDCFGS, guest_bndcfgs), | ||
| 1171 | FIELD64(HOST_IA32_PAT, host_ia32_pat), | ||
| 1172 | FIELD64(HOST_IA32_EFER, host_ia32_efer), | ||
| 1173 | FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl), | ||
| 1174 | FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control), | ||
| 1175 | FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control), | ||
| 1176 | FIELD(EXCEPTION_BITMAP, exception_bitmap), | ||
| 1177 | FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask), | ||
| 1178 | FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match), | ||
| 1179 | FIELD(CR3_TARGET_COUNT, cr3_target_count), | ||
| 1180 | FIELD(VM_EXIT_CONTROLS, vm_exit_controls), | ||
| 1181 | FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count), | ||
| 1182 | FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count), | ||
| 1183 | FIELD(VM_ENTRY_CONTROLS, vm_entry_controls), | ||
| 1184 | FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count), | ||
| 1185 | FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field), | ||
| 1186 | FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code), | ||
| 1187 | FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len), | ||
| 1188 | FIELD(TPR_THRESHOLD, tpr_threshold), | ||
| 1189 | FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control), | ||
| 1190 | FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error), | ||
| 1191 | FIELD(VM_EXIT_REASON, vm_exit_reason), | ||
| 1192 | FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info), | ||
| 1193 | FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code), | ||
| 1194 | FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field), | ||
| 1195 | FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code), | ||
| 1196 | FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len), | ||
| 1197 | FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info), | ||
| 1198 | FIELD(GUEST_ES_LIMIT, guest_es_limit), | ||
| 1199 | FIELD(GUEST_CS_LIMIT, guest_cs_limit), | ||
| 1200 | FIELD(GUEST_SS_LIMIT, guest_ss_limit), | ||
| 1201 | FIELD(GUEST_DS_LIMIT, guest_ds_limit), | ||
| 1202 | FIELD(GUEST_FS_LIMIT, guest_fs_limit), | ||
| 1203 | FIELD(GUEST_GS_LIMIT, guest_gs_limit), | ||
| 1204 | FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit), | ||
| 1205 | FIELD(GUEST_TR_LIMIT, guest_tr_limit), | ||
| 1206 | FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit), | ||
| 1207 | FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit), | ||
| 1208 | FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes), | ||
| 1209 | FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes), | ||
| 1210 | FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes), | ||
| 1211 | FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes), | ||
| 1212 | FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes), | ||
| 1213 | FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes), | ||
| 1214 | FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes), | ||
| 1215 | FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes), | ||
| 1216 | FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info), | ||
| 1217 | FIELD(GUEST_ACTIVITY_STATE, guest_activity_state), | ||
| 1218 | FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs), | ||
| 1219 | FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs), | ||
| 1220 | FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value), | ||
| 1221 | FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask), | ||
| 1222 | FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask), | ||
| 1223 | FIELD(CR0_READ_SHADOW, cr0_read_shadow), | ||
| 1224 | FIELD(CR4_READ_SHADOW, cr4_read_shadow), | ||
| 1225 | FIELD(CR3_TARGET_VALUE0, cr3_target_value0), | ||
| 1226 | FIELD(CR3_TARGET_VALUE1, cr3_target_value1), | ||
| 1227 | FIELD(CR3_TARGET_VALUE2, cr3_target_value2), | ||
| 1228 | FIELD(CR3_TARGET_VALUE3, cr3_target_value3), | ||
| 1229 | FIELD(EXIT_QUALIFICATION, exit_qualification), | ||
| 1230 | FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address), | ||
| 1231 | FIELD(GUEST_CR0, guest_cr0), | ||
| 1232 | FIELD(GUEST_CR3, guest_cr3), | ||
| 1233 | FIELD(GUEST_CR4, guest_cr4), | ||
| 1234 | FIELD(GUEST_ES_BASE, guest_es_base), | ||
| 1235 | FIELD(GUEST_CS_BASE, guest_cs_base), | ||
| 1236 | FIELD(GUEST_SS_BASE, guest_ss_base), | ||
| 1237 | FIELD(GUEST_DS_BASE, guest_ds_base), | ||
| 1238 | FIELD(GUEST_FS_BASE, guest_fs_base), | ||
| 1239 | FIELD(GUEST_GS_BASE, guest_gs_base), | ||
| 1240 | FIELD(GUEST_LDTR_BASE, guest_ldtr_base), | ||
| 1241 | FIELD(GUEST_TR_BASE, guest_tr_base), | ||
| 1242 | FIELD(GUEST_GDTR_BASE, guest_gdtr_base), | ||
| 1243 | FIELD(GUEST_IDTR_BASE, guest_idtr_base), | ||
| 1244 | FIELD(GUEST_DR7, guest_dr7), | ||
| 1245 | FIELD(GUEST_RSP, guest_rsp), | ||
| 1246 | FIELD(GUEST_RIP, guest_rip), | ||
| 1247 | FIELD(GUEST_RFLAGS, guest_rflags), | ||
| 1248 | FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions), | ||
| 1249 | FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp), | ||
| 1250 | FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip), | ||
| 1251 | FIELD(HOST_CR0, host_cr0), | ||
| 1252 | FIELD(HOST_CR3, host_cr3), | ||
| 1253 | FIELD(HOST_CR4, host_cr4), | ||
| 1254 | FIELD(HOST_FS_BASE, host_fs_base), | ||
| 1255 | FIELD(HOST_GS_BASE, host_gs_base), | ||
| 1256 | FIELD(HOST_TR_BASE, host_tr_base), | ||
| 1257 | FIELD(HOST_GDTR_BASE, host_gdtr_base), | ||
| 1258 | FIELD(HOST_IDTR_BASE, host_idtr_base), | ||
| 1259 | FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp), | ||
| 1260 | FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip), | ||
| 1261 | FIELD(HOST_RSP, host_rsp), | ||
| 1262 | FIELD(HOST_RIP, host_rip), | ||
| 1263 | }; | ||
| 1264 | |||
| 1265 | static inline short vmcs_field_to_offset(unsigned long field) | ||
| 1266 | { | ||
| 1267 | const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table); | ||
| 1268 | unsigned short offset; | ||
| 1269 | unsigned index; | ||
| 1270 | |||
| 1271 | if (field >> 15) | ||
| 1272 | return -ENOENT; | ||
| 1273 | |||
| 1274 | index = ROL16(field, 6); | ||
| 1275 | if (index >= size) | ||
| 1276 | return -ENOENT; | ||
| 1277 | |||
| 1278 | index = array_index_nospec(index, size); | ||
| 1279 | offset = vmcs_field_to_offset_table[index]; | ||
| 1280 | if (offset == 0) | ||
| 1281 | return -ENOENT; | ||
| 1282 | return offset; | ||
| 1283 | } | ||
| 1284 | |||
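vmcs_field_to_offset() is the building block for software VMREAD/VMWRITE emulation on the cached vmcs12. A minimal sketch of a 32-bit read using it follows; this is illustrative only, and the real helpers also handle 16-bit, 64-bit and natural-width fields.

/* Illustrative sketch: reading a 32-bit field out of the cached vmcs12. */
static u32 example_vmcs12_read32(struct vmcs12 *vmcs12, unsigned long field)
{
        short offset = vmcs_field_to_offset(field);

        if (offset < 0)
                return 0;       /* unknown or unsupported field encoding */

        return *(u32 *)((char *)vmcs12 + offset);
}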
| 1285 | static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) | ||
| 1286 | { | ||
| 1287 | return to_vmx(vcpu)->nested.cached_vmcs12; | ||
| 1288 | } | ||
| 1289 | |||
| 1290 | static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu) | ||
| 1291 | { | ||
| 1292 | return to_vmx(vcpu)->nested.cached_shadow_vmcs12; | ||
| 1293 | } | ||
| 1294 | |||
| 1295 | static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu); | ||
| 1296 | static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); | ||
| 1297 | static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); | ||
| 1298 | static bool vmx_xsaves_supported(void); | ||
| 1299 | static void vmx_set_segment(struct kvm_vcpu *vcpu, | ||
| 1300 | struct kvm_segment *var, int seg); | ||
| 1301 | static void vmx_get_segment(struct kvm_vcpu *vcpu, | ||
| 1302 | struct kvm_segment *var, int seg); | ||
| 1303 | static bool guest_state_valid(struct kvm_vcpu *vcpu); | ||
| 1304 | static u32 vmx_segment_access_rights(struct kvm_segment *var); | ||
| 1305 | static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); | ||
| 1306 | static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); | ||
| 1307 | static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); | ||
| 1308 | static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, | ||
| 1309 | u16 error_code); | ||
| 1310 | static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); | ||
| 1311 | static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, | ||
| 1312 | u32 msr, int type); | ||
| 1313 | |||
| 1314 | static DEFINE_PER_CPU(struct vmcs *, vmxarea); | ||
| 1315 | static DEFINE_PER_CPU(struct vmcs *, current_vmcs); | ||
| 1316 | /* | ||
| 1317 | * We maintain a per-CPU linked list of the VMCSs loaded on that CPU. This is needed | ||
| 1318 | * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. | ||
| 1319 | */ | ||
| 1320 | static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); | ||
| 1321 | |||
| 1322 | /* | ||
| 1323 | * We maintain a per-CPU linked list of vCPUs, so in wakeup_handler() we | ||
| 1324 | * can find which vCPU should be woken up. | ||
| 1325 | */ | ||
| 1326 | static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); | ||
| 1327 | static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); | ||
| 1328 | |||
| 1329 | enum { | ||
| 1330 | VMX_VMREAD_BITMAP, | ||
| 1331 | VMX_VMWRITE_BITMAP, | ||
| 1332 | VMX_BITMAP_NR | ||
| 1333 | }; | ||
| 1334 | |||
| 1335 | static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; | ||
| 1336 | |||
| 1337 | #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) | ||
| 1338 | #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) | ||
| 1339 | |||
| 1340 | static bool cpu_has_load_ia32_efer; | ||
| 1341 | static bool cpu_has_load_perf_global_ctrl; | ||
| 1342 | |||
| 1343 | static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); | ||
| 1344 | static DEFINE_SPINLOCK(vmx_vpid_lock); | ||
| 1345 | |||
| 1346 | static struct vmcs_config { | ||
| 1347 | int size; | ||
| 1348 | int order; | ||
| 1349 | u32 basic_cap; | ||
| 1350 | u32 revision_id; | ||
| 1351 | u32 pin_based_exec_ctrl; | ||
| 1352 | u32 cpu_based_exec_ctrl; | ||
| 1353 | u32 cpu_based_2nd_exec_ctrl; | ||
| 1354 | u32 vmexit_ctrl; | ||
| 1355 | u32 vmentry_ctrl; | ||
| 1356 | struct nested_vmx_msrs nested; | ||
| 1357 | } vmcs_config; | ||
| 1358 | |||
| 1359 | static struct vmx_capability { | ||
| 1360 | u32 ept; | ||
| 1361 | u32 vpid; | ||
| 1362 | } vmx_capability; | ||
| 1363 | |||
| 1364 | #define VMX_SEGMENT_FIELD(seg) \ | ||
| 1365 | [VCPU_SREG_##seg] = { \ | ||
| 1366 | .selector = GUEST_##seg##_SELECTOR, \ | ||
| 1367 | .base = GUEST_##seg##_BASE, \ | ||
| 1368 | .limit = GUEST_##seg##_LIMIT, \ | ||
| 1369 | .ar_bytes = GUEST_##seg##_AR_BYTES, \ | ||
| 1370 | } | ||
| 1371 | |||
| 1372 | static const struct kvm_vmx_segment_field { | ||
| 1373 | unsigned selector; | ||
| 1374 | unsigned base; | ||
| 1375 | unsigned limit; | ||
| 1376 | unsigned ar_bytes; | ||
| 1377 | } kvm_vmx_segment_fields[] = { | ||
| 1378 | VMX_SEGMENT_FIELD(CS), | ||
| 1379 | VMX_SEGMENT_FIELD(DS), | ||
| 1380 | VMX_SEGMENT_FIELD(ES), | ||
| 1381 | VMX_SEGMENT_FIELD(FS), | ||
| 1382 | VMX_SEGMENT_FIELD(GS), | ||
| 1383 | VMX_SEGMENT_FIELD(SS), | ||
| 1384 | VMX_SEGMENT_FIELD(TR), | ||
| 1385 | VMX_SEGMENT_FIELD(LDTR), | ||
| 1386 | }; | ||
| 1387 | |||
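kvm_vmx_segment_fields[] lets the segment accessors translate a VCPU_SREG_* index into the four VMCS field encodings for that segment. A minimal sketch, simplified from what an accessor like vmx_get_segment does, is:

/* Illustrative sketch: fetching a guest segment selector via the table. */
static u16 example_read_segment_selector(int seg /* VCPU_SREG_* index */)
{
        const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];

        /* sf->base, sf->limit and sf->ar_bytes name the matching fields. */
        return vmcs_read16(sf->selector);
}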
| 1388 | static u64 host_efer; | ||
| 1389 | |||
| 1390 | static void ept_save_pdptrs(struct kvm_vcpu *vcpu); | ||
| 1391 | |||
| 1392 | /* | ||
| 1393 | * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it | ||
| 1394 | * away by decrementing the array size. | ||
| 1395 | */ | ||
| 1396 | static const u32 vmx_msr_index[] = { | ||
| 1397 | #ifdef CONFIG_X86_64 | ||
| 1398 | MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, | ||
| 1399 | #endif | ||
| 1400 | MSR_EFER, MSR_TSC_AUX, MSR_STAR, | ||
| 1401 | }; | ||
| 1402 | |||
| 1403 | DEFINE_STATIC_KEY_FALSE(enable_evmcs); | ||
| 1404 | |||
| 1405 | #define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs)) | ||
| 1406 | |||
| 1407 | #define KVM_EVMCS_VERSION 1 | ||
| 1408 | |||
| 1409 | /* | ||
| 1410 | * Enlightened VMCSv1 doesn't support these: | ||
| 1411 | * | ||
| 1412 | * POSTED_INTR_NV = 0x00000002, | ||
| 1413 | * GUEST_INTR_STATUS = 0x00000810, | ||
| 1414 | * APIC_ACCESS_ADDR = 0x00002014, | ||
| 1415 | * POSTED_INTR_DESC_ADDR = 0x00002016, | ||
| 1416 | * EOI_EXIT_BITMAP0 = 0x0000201c, | ||
| 1417 | * EOI_EXIT_BITMAP1 = 0x0000201e, | ||
| 1418 | * EOI_EXIT_BITMAP2 = 0x00002020, | ||
| 1419 | * EOI_EXIT_BITMAP3 = 0x00002022, | ||
| 1420 | * GUEST_PML_INDEX = 0x00000812, | ||
| 1421 | * PML_ADDRESS = 0x0000200e, | ||
| 1422 | * VM_FUNCTION_CONTROL = 0x00002018, | ||
| 1423 | * EPTP_LIST_ADDRESS = 0x00002024, | ||
| 1424 | * VMREAD_BITMAP = 0x00002026, | ||
| 1425 | * VMWRITE_BITMAP = 0x00002028, | ||
| 1426 | * | ||
| 1427 | * TSC_MULTIPLIER = 0x00002032, | ||
| 1428 | * PLE_GAP = 0x00004020, | ||
| 1429 | * PLE_WINDOW = 0x00004022, | ||
| 1430 | * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, | ||
| 1431 | * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808, | ||
| 1432 | * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04, | ||
| 1433 | * | ||
| 1434 | * Currently unsupported in KVM: | ||
| 1435 | * GUEST_IA32_RTIT_CTL = 0x00002814, | ||
| 1436 | */ | ||
| 1437 | #define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \ | ||
| 1438 | PIN_BASED_VMX_PREEMPTION_TIMER) | ||
| 1439 | #define EVMCS1_UNSUPPORTED_2NDEXEC \ | ||
| 1440 | (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ | ||
| 1441 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \ | ||
| 1442 | SECONDARY_EXEC_APIC_REGISTER_VIRT | \ | ||
| 1443 | SECONDARY_EXEC_ENABLE_PML | \ | ||
| 1444 | SECONDARY_EXEC_ENABLE_VMFUNC | \ | ||
| 1445 | SECONDARY_EXEC_SHADOW_VMCS | \ | ||
| 1446 | SECONDARY_EXEC_TSC_SCALING | \ | ||
| 1447 | SECONDARY_EXEC_PAUSE_LOOP_EXITING) | ||
| 1448 | #define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) | ||
| 1449 | #define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) | ||
| 1450 | #define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING) | ||
| 1451 | |||
| 1452 | #if IS_ENABLED(CONFIG_HYPERV) | ||
| 1453 | static bool __read_mostly enlightened_vmcs = true; | ||
| 1454 | module_param(enlightened_vmcs, bool, 0444); | ||
| 1455 | |||
| 1456 | static inline void evmcs_write64(unsigned long field, u64 value) | ||
| 1457 | { | ||
| 1458 | u16 clean_field; | ||
| 1459 | int offset = get_evmcs_offset(field, &clean_field); | ||
| 1460 | |||
| 1461 | if (offset < 0) | ||
| 1462 | return; | ||
| 1463 | |||
| 1464 | *(u64 *)((char *)current_evmcs + offset) = value; | ||
| 1465 | |||
| 1466 | current_evmcs->hv_clean_fields &= ~clean_field; | ||
| 1467 | } | ||
| 1468 | |||
| 1469 | static inline void evmcs_write32(unsigned long field, u32 value) | ||
| 1470 | { | ||
| 1471 | u16 clean_field; | ||
| 1472 | int offset = get_evmcs_offset(field, &clean_field); | ||
| 1473 | |||
| 1474 | if (offset < 0) | ||
| 1475 | return; | ||
| 1476 | |||
| 1477 | *(u32 *)((char *)current_evmcs + offset) = value; | ||
| 1478 | current_evmcs->hv_clean_fields &= ~clean_field; | ||
| 1479 | } | ||
| 1480 | |||
| 1481 | static inline void evmcs_write16(unsigned long field, u16 value) | ||
| 1482 | { | ||
| 1483 | u16 clean_field; | ||
| 1484 | int offset = get_evmcs_offset(field, &clean_field); | ||
| 1485 | |||
| 1486 | if (offset < 0) | ||
| 1487 | return; | ||
| 1488 | |||
| 1489 | *(u16 *)((char *)current_evmcs + offset) = value; | ||
| 1490 | current_evmcs->hv_clean_fields &= ~clean_field; | ||
| 1491 | } | ||
| 1492 | |||
| 1493 | static inline u64 evmcs_read64(unsigned long field) | ||
| 1494 | { | ||
| 1495 | int offset = get_evmcs_offset(field, NULL); | ||
| 1496 | |||
| 1497 | if (offset < 0) | ||
| 1498 | return 0; | ||
| 1499 | |||
| 1500 | return *(u64 *)((char *)current_evmcs + offset); | ||
| 1501 | } | ||
| 1502 | |||
| 1503 | static inline u32 evmcs_read32(unsigned long field) | ||
| 1504 | { | ||
| 1505 | int offset = get_evmcs_offset(field, NULL); | ||
| 1506 | |||
| 1507 | if (offset < 0) | ||
| 1508 | return 0; | ||
| 1509 | |||
| 1510 | return *(u32 *)((char *)current_evmcs + offset); | ||
| 1511 | } | ||
| 1512 | |||
| 1513 | static inline u16 evmcs_read16(unsigned long field) | ||
| 1514 | { | ||
| 1515 | int offset = get_evmcs_offset(field, NULL); | ||
| 1516 | |||
| 1517 | if (offset < 0) | ||
| 1518 | return 0; | ||
| 1519 | |||
| 1520 | return *(u16 *)((char *)current_evmcs + offset); | ||
| 1521 | } | ||
| 1522 | |||
| 1523 | static inline void evmcs_touch_msr_bitmap(void) | ||
| 1524 | { | ||
| 1525 | if (unlikely(!current_evmcs)) | ||
| 1526 | return; | ||
| 1527 | |||
| 1528 | if (current_evmcs->hv_enlightenments_control.msr_bitmap) | ||
| 1529 | current_evmcs->hv_clean_fields &= | ||
| 1530 | ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; | ||
| 1531 | } | ||
| 1532 | |||
| 1533 | static void evmcs_load(u64 phys_addr) | ||
| 1534 | { | ||
| 1535 | struct hv_vp_assist_page *vp_ap = | ||
| 1536 | hv_get_vp_assist_page(smp_processor_id()); | ||
| 1537 | |||
| 1538 | vp_ap->current_nested_vmcs = phys_addr; | ||
| 1539 | vp_ap->enlighten_vmentry = 1; | ||
| 1540 | } | ||
| 1541 | |||
| 1542 | static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) | ||
| 1543 | { | ||
| 1544 | vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL; | ||
| 1545 | vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC; | ||
| 1546 | |||
| 1547 | vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; | ||
| 1548 | vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; | ||
| 1549 | |||
| 1550 | } | ||
| 1551 | |||
| 1552 | /* check_ept_pointer_match() must be called with ept_pointer_lock held. */ | ||
| 1553 | static void check_ept_pointer_match(struct kvm *kvm) | ||
| 1554 | { | ||
| 1555 | struct kvm_vcpu *vcpu; | ||
| 1556 | u64 tmp_eptp = INVALID_PAGE; | ||
| 1557 | int i; | ||
| 1558 | |||
| 1559 | kvm_for_each_vcpu(i, vcpu, kvm) { | ||
| 1560 | if (!VALID_PAGE(tmp_eptp)) { | ||
| 1561 | tmp_eptp = to_vmx(vcpu)->ept_pointer; | ||
| 1562 | } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) { | ||
| 1563 | to_kvm_vmx(kvm)->ept_pointers_match | ||
| 1564 | = EPT_POINTERS_MISMATCH; | ||
| 1565 | return; | ||
| 1566 | } | ||
| 1567 | } | ||
| 1568 | |||
| 1569 | to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH; | ||
| 1570 | } | ||
| 1571 | |||
| 1572 | static int vmx_hv_remote_flush_tlb(struct kvm *kvm) | ||
| 1573 | { | ||
| 1574 | struct kvm_vcpu *vcpu; | ||
| 1575 | int ret = -ENOTSUPP, i; | ||
| 1576 | |||
| 1577 | spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); | ||
| 1578 | |||
| 1579 | if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK) | ||
| 1580 | check_ept_pointer_match(kvm); | ||
| 1581 | |||
| 1582 | /* | ||
| 1583 | * The FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the base address | ||
| 1584 | * of the EPT PML4 table, so strip off the EPT configuration information. | ||
| 1585 | */ | ||
| 1586 | if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) { | ||
| 1587 | kvm_for_each_vcpu(i, vcpu, kvm) | ||
| 1588 | ret |= hyperv_flush_guest_mapping( | ||
| 1589 | to_vmx(kvm_get_vcpu(kvm, i))->ept_pointer & PAGE_MASK); | ||
| 1590 | } else { | ||
| 1591 | ret = hyperv_flush_guest_mapping( | ||
| 1592 | to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer & PAGE_MASK); | ||
| 1593 | } | ||
| 1594 | |||
| 1595 | spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); | ||
| 1596 | return ret; | ||
| 1597 | } | ||
| 1598 | #else /* !IS_ENABLED(CONFIG_HYPERV) */ | ||
| 1599 | static inline void evmcs_write64(unsigned long field, u64 value) {} | ||
| 1600 | static inline void evmcs_write32(unsigned long field, u32 value) {} | ||
| 1601 | static inline void evmcs_write16(unsigned long field, u16 value) {} | ||
| 1602 | static inline u64 evmcs_read64(unsigned long field) { return 0; } | ||
| 1603 | static inline u32 evmcs_read32(unsigned long field) { return 0; } | ||
| 1604 | static inline u16 evmcs_read16(unsigned long field) { return 0; } | ||
| 1605 | static inline void evmcs_load(u64 phys_addr) {} | ||
| 1606 | static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {} | ||
| 1607 | static inline void evmcs_touch_msr_bitmap(void) {} | ||
| 1608 | #endif /* IS_ENABLED(CONFIG_HYPERV) */ | ||
| 1609 | |||
| 1610 | static int nested_enable_evmcs(struct kvm_vcpu *vcpu, | ||
| 1611 | uint16_t *vmcs_version) | ||
| 1612 | { | ||
| 1613 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 1614 | |||
| 1615 | /* | ||
| 1616 | * vmcs_version represents the range of supported Enlightened VMCS | ||
| 1617 | * versions: the lower 8 bits hold the minimal version, the higher 8 bits the | ||
| 1618 | * maximum supported version. KVM supports versions from 1 to | ||
| 1619 | * KVM_EVMCS_VERSION. | ||
| 1620 | */ | ||
| 1621 | if (vmcs_version) | ||
| 1622 | *vmcs_version = (KVM_EVMCS_VERSION << 8) | 1; | ||
| 1623 | |||
| 1624 | /* For simplicity, we don't support disabling the feature. */ | ||
| 1625 | if (vmx->nested.enlightened_vmcs_enabled) | ||
| 1626 | return 0; | ||
| 1627 | |||
| 1628 | vmx->nested.enlightened_vmcs_enabled = true; | ||
| 1629 | |||
| 1630 | vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL; | ||
| 1631 | vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; | ||
| 1632 | vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; | ||
| 1633 | vmx->nested.msrs.secondary_ctls_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC; | ||
| 1634 | vmx->nested.msrs.vmfunc_controls &= ~EVMCS1_UNSUPPORTED_VMFUNC; | ||
| 1635 | |||
| 1636 | return 0; | ||
| 1637 | } | ||
| 1638 | |||
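The packed vmcs_version returned above is meant to be split back into a minimum and maximum supported Enlightened VMCS version by the consumer. A trivial decode, shown here only as an illustrative sketch, looks like this:

/* Illustrative sketch: unpacking the (max << 8) | min version encoding. */
static void example_decode_evmcs_version(u16 vmcs_version)
{
        u8 min_version = vmcs_version & 0xff;
        u8 max_version = vmcs_version >> 8;

        /* With KVM_EVMCS_VERSION == 1 this yields min = 1, max = 1. */
        pr_info("eVMCS versions %u-%u supported\n", min_version, max_version);
}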
| 1639 | static inline bool is_exception_n(u32 intr_info, u8 vector) | ||
| 1640 | { | ||
| 1641 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | | ||
| 1642 | INTR_INFO_VALID_MASK)) == | ||
| 1643 | (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK); | ||
| 1644 | } | ||
| 1645 | |||
| 1646 | static inline bool is_debug(u32 intr_info) | ||
| 1647 | { | ||
| 1648 | return is_exception_n(intr_info, DB_VECTOR); | ||
| 1649 | } | ||
| 1650 | |||
| 1651 | static inline bool is_breakpoint(u32 intr_info) | ||
| 1652 | { | ||
| 1653 | return is_exception_n(intr_info, BP_VECTOR); | ||
| 1654 | } | ||
| 1655 | |||
| 1656 | static inline bool is_page_fault(u32 intr_info) | ||
| 1657 | { | ||
| 1658 | return is_exception_n(intr_info, PF_VECTOR); | ||
| 1659 | } | ||
| 1660 | |||
| 1661 | static inline bool is_invalid_opcode(u32 intr_info) | ||
| 1662 | { | ||
| 1663 | return is_exception_n(intr_info, UD_VECTOR); | ||
| 1664 | } | ||
| 1665 | |||
| 1666 | static inline bool is_gp_fault(u32 intr_info) | ||
| 1667 | { | ||
| 1668 | return is_exception_n(intr_info, GP_VECTOR); | ||
| 1669 | } | ||
| 1670 | |||
| 1671 | static inline bool is_machine_check(u32 intr_info) | ||
| 1672 | { | ||
| 1673 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | | ||
| 1674 | INTR_INFO_VALID_MASK)) == | ||
| 1675 | (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); | ||
| 1676 | } | ||
| 1677 | |||
| 1678 | /* Undocumented: icebp/int1 */ | ||
| 1679 | static inline bool is_icebp(u32 intr_info) | ||
| 1680 | { | ||
| 1681 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) | ||
| 1682 | == (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK); | ||
| 1683 | } | ||
| 1684 | |||
| 1685 | static inline bool cpu_has_vmx_msr_bitmap(void) | ||
| 1686 | { | ||
| 1687 | return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; | ||
| 1688 | } | ||
| 1689 | |||
| 1690 | static inline bool cpu_has_vmx_tpr_shadow(void) | ||
| 1691 | { | ||
| 1692 | return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; | ||
| 1693 | } | ||
| 1694 | |||
| 1695 | static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu) | ||
| 1696 | { | ||
| 1697 | return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu); | ||
| 1698 | } | ||
| 1699 | |||
| 1700 | static inline bool cpu_has_secondary_exec_ctrls(void) | ||
| 1701 | { | ||
| 1702 | return vmcs_config.cpu_based_exec_ctrl & | ||
| 1703 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; | ||
| 1704 | } | ||
| 1705 | |||
| 1706 | static inline bool cpu_has_vmx_virtualize_apic_accesses(void) | ||
| 1707 | { | ||
| 1708 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 1709 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
| 1710 | } | ||
| 1711 | |||
| 1712 | static inline bool cpu_has_vmx_virtualize_x2apic_mode(void) | ||
| 1713 | { | ||
| 1714 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 1715 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; | ||
| 1716 | } | ||
| 1717 | |||
| 1718 | static inline bool cpu_has_vmx_apic_register_virt(void) | ||
| 1719 | { | ||
| 1720 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 1721 | SECONDARY_EXEC_APIC_REGISTER_VIRT; | ||
| 1722 | } | ||
| 1723 | |||
| 1724 | static inline bool cpu_has_vmx_virtual_intr_delivery(void) | ||
| 1725 | { | ||
| 1726 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 1727 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; | ||
| 1728 | } | ||
| 1729 | |||
| 1730 | static inline bool cpu_has_vmx_encls_vmexit(void) | ||
| 1731 | { | ||
| 1732 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 1733 | SECONDARY_EXEC_ENCLS_EXITING; | ||
| 1734 | } | ||
| 1735 | |||
| 1736 | /* | ||
| 1737 | * Comment format: document - errata name - stepping - processor name. | ||
| 1738 | * Taken from | ||
| 1739 | * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp | ||
| 1740 | */ | ||
| 1741 | static u32 vmx_preemption_cpu_tfms[] = { | ||
| 1742 | /* 323344.pdf - BA86 - D0 - Xeon 7500 Series */ | ||
| 1743 | 0x000206E6, | ||
| 1744 | /* 323056.pdf - AAX65 - C2 - Xeon L3406 */ | ||
| 1745 | /* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */ | ||
| 1746 | /* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */ | ||
| 1747 | 0x00020652, | ||
| 1748 | /* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */ | ||
| 1749 | 0x00020655, | ||
| 1750 | /* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */ | ||
| 1751 | /* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */ | ||
| 1752 | /* | ||
| 1753 | * 320767.pdf - AAP86 - B1 - | ||
| 1754 | * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile | ||
| 1755 | */ | ||
| 1756 | 0x000106E5, | ||
| 1757 | /* 321333.pdf - AAM126 - C0 - Xeon 3500 */ | ||
| 1758 | 0x000106A0, | ||
| 1759 | /* 321333.pdf - AAM126 - C1 - Xeon 3500 */ | ||
| 1760 | 0x000106A1, | ||
| 1761 | /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */ | ||
| 1762 | 0x000106A4, | ||
| 1763 | /* 321333.pdf - AAM126 - D0 - Xeon 3500 */ | ||
| 1764 | /* 321324.pdf - AAK139 - D0 - Xeon 5500 */ | ||
| 1765 | /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */ | ||
| 1766 | 0x000106A5, | ||
| 1767 | }; | ||
| 1768 | |||
| 1769 | static inline bool cpu_has_broken_vmx_preemption_timer(void) | ||
| 1770 | { | ||
| 1771 | u32 eax = cpuid_eax(0x00000001), i; | ||
| 1772 | |||
| 1773 | /* Clear the reserved bits */ | ||
| 1774 | eax &= ~(0x3U << 14 | 0xfU << 28); | ||
| 1775 | for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++) | ||
| 1776 | if (eax == vmx_preemption_cpu_tfms[i]) | ||
| 1777 | return true; | ||
| 1778 | |||
| 1779 | return false; | ||
| 1780 | } | ||
| 1781 | |||
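Each entry in vmx_preemption_cpu_tfms[] is a raw CPUID.1:EAX signature with the reserved bits cleared; as a worked example, 0x000206E6 decodes to family 0x6, model 0x2E, stepping 6 (Nehalem-EX, i.e. the Xeon 7500 line). A small illustrative decoder, not part of this file, is:

/* Illustrative sketch: splitting a CPUID.1:EAX signature into its parts.
 * (The extended-family adjustment is omitted; family 0x6 parts don't need it.) */
static void example_decode_signature(u32 eax)
{
        u32 stepping = eax & 0xf;
        u32 family   = (eax >> 8) & 0xf;
        u32 model    = ((eax >> 12) & 0xf0) | ((eax >> 4) & 0xf);

        pr_info("family 0x%x model 0x%x stepping %u\n", family, model, stepping);
}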
| 1782 | static inline bool cpu_has_vmx_preemption_timer(void) | ||
| 1783 | { | ||
| 1784 | return vmcs_config.pin_based_exec_ctrl & | ||
| 1785 | PIN_BASED_VMX_PREEMPTION_TIMER; | ||
| 1786 | } | ||
| 1787 | |||
| 1788 | static inline bool cpu_has_vmx_posted_intr(void) | ||
| 1789 | { | ||
| 1790 | return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && | ||
| 1791 | vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; | ||
| 1792 | } | ||
| 1793 | |||
| 1794 | static inline bool cpu_has_vmx_apicv(void) | ||
| 1795 | { | ||
| 1796 | return cpu_has_vmx_apic_register_virt() && | ||
| 1797 | cpu_has_vmx_virtual_intr_delivery() && | ||
| 1798 | cpu_has_vmx_posted_intr(); | ||
| 1799 | } | ||
| 1800 | |||
| 1801 | static inline bool cpu_has_vmx_flexpriority(void) | ||
| 1802 | { | ||
| 1803 | return cpu_has_vmx_tpr_shadow() && | ||
| 1804 | cpu_has_vmx_virtualize_apic_accesses(); | ||
| 1805 | } | ||
| 1806 | |||
| 1807 | static inline bool cpu_has_vmx_ept_execute_only(void) | ||
| 1808 | { | ||
| 1809 | return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; | ||
| 1810 | } | ||
| 1811 | |||
| 1812 | static inline bool cpu_has_vmx_ept_2m_page(void) | ||
| 1813 | { | ||
| 1814 | return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; | ||
| 1815 | } | ||
| 1816 | |||
| 1817 | static inline bool cpu_has_vmx_ept_1g_page(void) | ||
| 1818 | { | ||
| 1819 | return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; | ||
| 1820 | } | ||
| 1821 | |||
| 1822 | static inline bool cpu_has_vmx_ept_4levels(void) | ||
| 1823 | { | ||
| 1824 | return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; | ||
| 1825 | } | ||
| 1826 | |||
| 1827 | static inline bool cpu_has_vmx_ept_mt_wb(void) | ||
| 1828 | { | ||
| 1829 | return vmx_capability.ept & VMX_EPTP_WB_BIT; | ||
| 1830 | } | ||
| 1831 | |||
| 1832 | static inline bool cpu_has_vmx_ept_5levels(void) | ||
| 1833 | { | ||
| 1834 | return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT; | ||
| 1835 | } | ||
| 1836 | |||
| 1837 | static inline bool cpu_has_vmx_ept_ad_bits(void) | ||
| 1838 | { | ||
| 1839 | return vmx_capability.ept & VMX_EPT_AD_BIT; | ||
| 1840 | } | ||
| 1841 | |||
| 1842 | static inline bool cpu_has_vmx_invept_context(void) | ||
| 1843 | { | ||
| 1844 | return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; | ||
| 1845 | } | ||
| 1846 | |||
| 1847 | static inline bool cpu_has_vmx_invept_global(void) | ||
| 1848 | { | ||
| 1849 | return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; | ||
| 1850 | } | ||
| 1851 | |||
| 1852 | static inline bool cpu_has_vmx_invvpid_individual_addr(void) | ||
| 1853 | { | ||
| 1854 | return vmx_capability.vpid & VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT; | ||
| 1855 | } | ||
| 1856 | |||
| 1857 | static inline bool cpu_has_vmx_invvpid_single(void) | ||
| 1858 | { | ||
| 1859 | return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT; | ||
| 1860 | } | ||
| 1861 | |||
| 1862 | static inline bool cpu_has_vmx_invvpid_global(void) | ||
| 1863 | { | ||
| 1864 | return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; | ||
| 1865 | } | ||
| 1866 | |||
| 1867 | static inline bool cpu_has_vmx_invvpid(void) | ||
| 1868 | { | ||
| 1869 | return vmx_capability.vpid & VMX_VPID_INVVPID_BIT; | ||
| 1870 | } | ||
| 1871 | |||
| 1872 | static inline bool cpu_has_vmx_ept(void) | ||
| 1873 | { | ||
| 1874 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 1875 | SECONDARY_EXEC_ENABLE_EPT; | ||
| 1876 | } | ||
| 1877 | |||
| 1878 | static inline bool cpu_has_vmx_unrestricted_guest(void) | ||
| 1879 | { | ||
| 1880 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 1881 | SECONDARY_EXEC_UNRESTRICTED_GUEST; | ||
| 1882 | } | ||
| 1883 | |||
| 1884 | static inline bool cpu_has_vmx_ple(void) | ||
| 1885 | { | ||
| 1886 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 1887 | SECONDARY_EXEC_PAUSE_LOOP_EXITING; | ||
| 1888 | } | ||
| 1889 | |||
| 1890 | static inline bool cpu_has_vmx_basic_inout(void) | ||
| 1891 | { | ||
| 1892 | return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT); | ||
| 1893 | } | ||
| 1894 | |||
| 1895 | static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) | ||
| 1896 | { | ||
| 1897 | return flexpriority_enabled && lapic_in_kernel(vcpu); | ||
| 1898 | } | ||
| 1899 | |||
| 1900 | static inline bool cpu_has_vmx_vpid(void) | ||
| 1901 | { | ||
| 1902 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 1903 | SECONDARY_EXEC_ENABLE_VPID; | ||
| 1904 | } | ||
| 1905 | |||
| 1906 | static inline bool cpu_has_vmx_rdtscp(void) | ||
| 1907 | { | ||
| 1908 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 1909 | SECONDARY_EXEC_RDTSCP; | ||
| 1910 | } | ||
| 1911 | |||
| 1912 | static inline bool cpu_has_vmx_invpcid(void) | ||
| 1913 | { | ||
| 1914 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 1915 | SECONDARY_EXEC_ENABLE_INVPCID; | ||
| 1916 | } | ||
| 1917 | |||
| 1918 | static inline bool cpu_has_virtual_nmis(void) | ||
| 1919 | { | ||
| 1920 | return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; | ||
| 1921 | } | ||
| 1922 | |||
| 1923 | static inline bool cpu_has_vmx_wbinvd_exit(void) | ||
| 1924 | { | ||
| 1925 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 1926 | SECONDARY_EXEC_WBINVD_EXITING; | ||
| 1927 | } | ||
| 1928 | |||
| 1929 | static inline bool cpu_has_vmx_shadow_vmcs(void) | ||
| 1930 | { | ||
| 1931 | u64 vmx_msr; | ||
| 1932 | rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); | ||
| 1933 | /* check if the cpu supports writing r/o exit information fields */ | ||
| 1934 | if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) | ||
| 1935 | return false; | ||
| 1936 | |||
| 1937 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 1938 | SECONDARY_EXEC_SHADOW_VMCS; | ||
| 1939 | } | ||
| 1940 | |||
| 1941 | static inline bool cpu_has_vmx_pml(void) | ||
| 1942 | { | ||
| 1943 | return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; | ||
| 1944 | } | ||
| 1945 | |||
| 1946 | static inline bool cpu_has_vmx_tsc_scaling(void) | ||
| 1947 | { | ||
| 1948 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 1949 | SECONDARY_EXEC_TSC_SCALING; | ||
| 1950 | } | ||
| 1951 | |||
| 1952 | static inline bool cpu_has_vmx_vmfunc(void) | ||
| 1953 | { | ||
| 1954 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 1955 | SECONDARY_EXEC_ENABLE_VMFUNC; | ||
| 1956 | } | ||
| 1957 | |||
| 1958 | static bool vmx_umip_emulated(void) | ||
| 1959 | { | ||
| 1960 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 1961 | SECONDARY_EXEC_DESC; | ||
| 1962 | } | ||
| 1963 | |||
| 1964 | static inline bool report_flexpriority(void) | ||
| 1965 | { | ||
| 1966 | return flexpriority_enabled; | ||
| 1967 | } | ||
| 1968 | |||
| 1969 | static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu) | ||
| 1970 | { | ||
| 1971 | return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low); | ||
| 1972 | } | ||
| 1973 | |||
| 1974 | /* | ||
| 1975 | * Do the virtual VMX capability MSRs specify that L1 can use VMWRITE | ||
| 1976 | * to modify any valid field of the VMCS, or are the VM-exit | ||
| 1977 | * information fields read-only? | ||
| 1978 | */ | ||
| 1979 | static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu) | ||
| 1980 | { | ||
| 1981 | return to_vmx(vcpu)->nested.msrs.misc_low & | ||
| 1982 | MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS; | ||
| 1983 | } | ||
| 1984 | |||
| 1985 | static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu) | ||
| 1986 | { | ||
| 1987 | return to_vmx(vcpu)->nested.msrs.misc_low & VMX_MISC_ZERO_LEN_INS; | ||
| 1988 | } | ||
| 1989 | |||
| 1990 | static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu) | ||
| 1991 | { | ||
| 1992 | return to_vmx(vcpu)->nested.msrs.procbased_ctls_high & | ||
| 1993 | CPU_BASED_MONITOR_TRAP_FLAG; | ||
| 1994 | } | ||
| 1995 | |||
| 1996 | static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu) | ||
| 1997 | { | ||
| 1998 | return to_vmx(vcpu)->nested.msrs.secondary_ctls_high & | ||
| 1999 | SECONDARY_EXEC_SHADOW_VMCS; | ||
| 2000 | } | ||
| 2001 | |||
| 2002 | static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) | ||
| 2003 | { | ||
| 2004 | return vmcs12->cpu_based_vm_exec_control & bit; | ||
| 2005 | } | ||
| 2006 | |||
| 2007 | static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) | ||
| 2008 | { | ||
| 2009 | return (vmcs12->cpu_based_vm_exec_control & | ||
| 2010 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && | ||
| 2011 | (vmcs12->secondary_vm_exec_control & bit); | ||
| 2012 | } | ||
| 2013 | |||
| 2014 | static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12) | ||
| 2015 | { | ||
| 2016 | return vmcs12->pin_based_vm_exec_control & | ||
| 2017 | PIN_BASED_VMX_PREEMPTION_TIMER; | ||
| 2018 | } | ||
| 2019 | |||
| 2020 | static inline bool nested_cpu_has_nmi_exiting(struct vmcs12 *vmcs12) | ||
| 2021 | { | ||
| 2022 | return vmcs12->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING; | ||
| 2023 | } | ||
| 2024 | |||
| 2025 | static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) | ||
| 2026 | { | ||
| 2027 | return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; | ||
| 2028 | } | ||
| 2029 | |||
| 2030 | static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) | ||
| 2031 | { | ||
| 2032 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); | ||
| 2033 | } | ||
| 2034 | |||
| 2035 | static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) | ||
| 2036 | { | ||
| 2037 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); | ||
| 2038 | } | ||
| 2039 | |||
| 2040 | static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12) | ||
| 2041 | { | ||
| 2042 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML); | ||
| 2043 | } | ||
| 2044 | |||
| 2045 | static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) | ||
| 2046 | { | ||
| 2047 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); | ||
| 2048 | } | ||
| 2049 | |||
| 2050 | static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12) | ||
| 2051 | { | ||
| 2052 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID); | ||
| 2053 | } | ||
| 2054 | |||
| 2055 | static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) | ||
| 2056 | { | ||
| 2057 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); | ||
| 2058 | } | ||
| 2059 | |||
| 2060 | static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12) | ||
| 2061 | { | ||
| 2062 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); | ||
| 2063 | } | ||
| 2064 | |||
| 2065 | static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12) | ||
| 2066 | { | ||
| 2067 | return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR; | ||
| 2068 | } | ||
| 2069 | |||
| 2070 | static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12) | ||
| 2071 | { | ||
| 2072 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC); | ||
| 2073 | } | ||
| 2074 | |||
| 2075 | static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12) | ||
| 2076 | { | ||
| 2077 | return nested_cpu_has_vmfunc(vmcs12) && | ||
| 2078 | (vmcs12->vm_function_control & | ||
| 2079 | VMX_VMFUNC_EPTP_SWITCHING); | ||
| 2080 | } | ||
| 2081 | |||
| 2082 | static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12) | ||
| 2083 | { | ||
| 2084 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS); | ||
| 2085 | } | ||
| 2086 | |||
| 2087 | static inline bool is_nmi(u32 intr_info) | ||
| 2088 | { | ||
| 2089 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) | ||
| 2090 | == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK); | ||
| 2091 | } | ||
| 2092 | |||
| 2093 | static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, | ||
| 2094 | u32 exit_intr_info, | ||
| 2095 | unsigned long exit_qualification); | ||
| 2096 | |||
| 2097 | static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) | ||
| 2098 | { | ||
| 2099 | int i; | ||
| 2100 | |||
| 2101 | for (i = 0; i < vmx->nmsrs; ++i) | ||
| 2102 | if (vmx_msr_index[vmx->guest_msrs[i].index] == msr) | ||
| 2103 | return i; | ||
| 2104 | return -1; | ||
| 2105 | } | ||
| 2106 | |||
| 2107 | static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva) | ||
| 2108 | { | ||
| 2109 | struct { | ||
| 2110 | u64 vpid : 16; | ||
| 2111 | u64 rsvd : 48; | ||
| 2112 | u64 gva; | ||
| 2113 | } operand = { vpid, 0, gva }; | ||
| 2114 | bool error; | ||
| 2115 | |||
| 2116 | asm volatile (__ex("invvpid %2, %1") CC_SET(na) | ||
| 2117 | : CC_OUT(na) (error) : "r"(ext), "m"(operand)); | ||
| 2118 | BUG_ON(error); | ||
| 2119 | } | ||
| 2120 | |||
| 2121 | static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa) | ||
| 2122 | { | ||
| 2123 | struct { | ||
| 2124 | u64 eptp, gpa; | ||
| 2125 | } operand = {eptp, gpa}; | ||
| 2126 | bool error; | ||
| 2127 | |||
| 2128 | asm volatile (__ex("invept %2, %1") CC_SET(na) | ||
| 2129 | : CC_OUT(na) (error) : "r"(ext), "m"(operand)); | ||
| 2130 | BUG_ON(error); | ||
| 2131 | } | ||
| 2132 | |||
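__invvpid() and __invept() are thin wrappers around the INVVPID/INVEPT instructions; a typical caller picks the narrowest invalidation scope the CPU supports and falls back to a global flush otherwise. The sketch below is a simplified illustration of that pattern (the real per-VPID sync helpers live elsewhere in this file):

/* Illustrative sketch: flush mappings for one VPID, widest-scope fallback. */
static void example_flush_vpid(int vpid)
{
        if (vpid == 0)
                return;         /* VPID 0: tagging not in use, nothing to do */

        if (cpu_has_vmx_invvpid_single())
                __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
        else if (cpu_has_vmx_invvpid_global())
                __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
}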
| 2133 | static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) | ||
| 2134 | { | ||
| 2135 | int i; | ||
| 2136 | |||
| 2137 | i = __find_msr_index(vmx, msr); | ||
| 2138 | if (i >= 0) | ||
| 2139 | return &vmx->guest_msrs[i]; | ||
| 2140 | return NULL; | ||
| 2141 | } | ||
| 2142 | |||
| 2143 | static void vmcs_clear(struct vmcs *vmcs) | ||
| 2144 | { | ||
| 2145 | u64 phys_addr = __pa(vmcs); | ||
| 2146 | bool error; | ||
| 2147 | |||
| 2148 | asm volatile (__ex("vmclear %1") CC_SET(na) | ||
| 2149 | : CC_OUT(na) (error) : "m"(phys_addr)); | ||
| 2150 | if (unlikely(error)) | ||
| 2151 | printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", | ||
| 2152 | vmcs, phys_addr); | ||
| 2153 | } | ||
| 2154 | |||
| 2155 | static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) | ||
| 2156 | { | ||
| 2157 | vmcs_clear(loaded_vmcs->vmcs); | ||
| 2158 | if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) | ||
| 2159 | vmcs_clear(loaded_vmcs->shadow_vmcs); | ||
| 2160 | loaded_vmcs->cpu = -1; | ||
| 2161 | loaded_vmcs->launched = 0; | ||
| 2162 | } | ||
| 2163 | |||
| 2164 | static void vmcs_load(struct vmcs *vmcs) | ||
| 2165 | { | ||
| 2166 | u64 phys_addr = __pa(vmcs); | ||
| 2167 | bool error; | ||
| 2168 | |||
| 2169 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 2170 | return evmcs_load(phys_addr); | ||
| 2171 | |||
| 2172 | asm volatile (__ex("vmptrld %1") CC_SET(na) | ||
| 2173 | : CC_OUT(na) (error) : "m"(phys_addr)); | ||
| 2174 | if (unlikely(error)) | ||
| 2175 | printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", | ||
| 2176 | vmcs, phys_addr); | ||
| 2177 | } | ||
| 2178 | |||
| 2179 | #ifdef CONFIG_KEXEC_CORE | ||
| 2180 | /* | ||
| 2181 | * This bitmap indicates, for each cpu, whether the crash-time | ||
| 2182 | * vmclear operation is enabled. It is disabled on all cpus by | ||
| 2183 | * default. | ||
| 2184 | */ | ||
| 2185 | static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE; | ||
| 2186 | |||
| 2187 | static inline void crash_enable_local_vmclear(int cpu) | ||
| 2188 | { | ||
| 2189 | cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap); | ||
| 2190 | } | ||
| 2191 | |||
| 2192 | static inline void crash_disable_local_vmclear(int cpu) | ||
| 2193 | { | ||
| 2194 | cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap); | ||
| 2195 | } | ||
| 2196 | |||
| 2197 | static inline int crash_local_vmclear_enabled(int cpu) | ||
| 2198 | { | ||
| 2199 | return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap); | ||
| 2200 | } | ||
| 2201 | |||
| 2202 | static void crash_vmclear_local_loaded_vmcss(void) | ||
| 2203 | { | ||
| 2204 | int cpu = raw_smp_processor_id(); | ||
| 2205 | struct loaded_vmcs *v; | ||
| 2206 | |||
| 2207 | if (!crash_local_vmclear_enabled(cpu)) | ||
| 2208 | return; | ||
| 2209 | |||
| 2210 | list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), | ||
| 2211 | loaded_vmcss_on_cpu_link) | ||
| 2212 | vmcs_clear(v->vmcs); | ||
| 2213 | } | ||
| 2214 | #else | ||
| 2215 | static inline void crash_enable_local_vmclear(int cpu) { } | ||
| 2216 | static inline void crash_disable_local_vmclear(int cpu) { } | ||
| 2217 | #endif /* CONFIG_KEXEC_CORE */ | ||
| 2218 | |||
| 2219 | static void __loaded_vmcs_clear(void *arg) | ||
| 2220 | { | ||
| 2221 | struct loaded_vmcs *loaded_vmcs = arg; | ||
| 2222 | int cpu = raw_smp_processor_id(); | ||
| 2223 | |||
| 2224 | if (loaded_vmcs->cpu != cpu) | ||
| 2225 | return; /* vcpu migration can race with cpu offline */ | ||
| 2226 | if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) | ||
| 2227 | per_cpu(current_vmcs, cpu) = NULL; | ||
| 2228 | crash_disable_local_vmclear(cpu); | ||
| 2229 | list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); | ||
| 2230 | |||
| 2231 | /* | ||
| 2232 | * We should ensure that updating loaded_vmcs->loaded_vmcss_on_cpu_link | ||
| 2233 | * happens before setting loaded_vmcs->cpu to -1, which is done in | ||
| 2234 | * loaded_vmcs_init. Otherwise, another cpu could see cpu == -1 first | ||
| 2235 | * and then add the vmcs to the percpu list before it is deleted. | ||
| 2236 | */ | ||
| 2237 | smp_wmb(); | ||
| 2238 | |||
| 2239 | loaded_vmcs_init(loaded_vmcs); | ||
| 2240 | crash_enable_local_vmclear(cpu); | ||
| 2241 | } | ||
| 2242 | |||
| 2243 | static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) | ||
| 2244 | { | ||
| 2245 | int cpu = loaded_vmcs->cpu; | ||
| 2246 | |||
| 2247 | if (cpu != -1) | ||
| 2248 | smp_call_function_single(cpu, | ||
| 2249 | __loaded_vmcs_clear, loaded_vmcs, 1); | ||
| 2250 | } | ||
| 2251 | |||
| 2252 | static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr) | ||
| 2253 | { | ||
| 2254 | if (vpid == 0) | ||
| 2255 | return true; | ||
| 2256 | |||
| 2257 | if (cpu_has_vmx_invvpid_individual_addr()) { | ||
| 2258 | __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr); | ||
| 2259 | return true; | ||
| 2260 | } | ||
| 2261 | |||
| 2262 | return false; | ||
| 2263 | } | ||
| 2264 | |||
| 2265 | static inline void vpid_sync_vcpu_single(int vpid) | ||
| 2266 | { | ||
| 2267 | if (vpid == 0) | ||
| 2268 | return; | ||
| 2269 | |||
| 2270 | if (cpu_has_vmx_invvpid_single()) | ||
| 2271 | __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0); | ||
| 2272 | } | ||
| 2273 | |||
| 2274 | static inline void vpid_sync_vcpu_global(void) | ||
| 2275 | { | ||
| 2276 | if (cpu_has_vmx_invvpid_global()) | ||
| 2277 | __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); | ||
| 2278 | } | ||
| 2279 | |||
| 2280 | static inline void vpid_sync_context(int vpid) | ||
| 2281 | { | ||
| 2282 | if (cpu_has_vmx_invvpid_single()) | ||
| 2283 | vpid_sync_vcpu_single(vpid); | ||
| 2284 | else | ||
| 2285 | vpid_sync_vcpu_global(); | ||
| 2286 | } | ||
| 2287 | |||
| 2288 | static inline void ept_sync_global(void) | ||
| 2289 | { | ||
| 2290 | __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); | ||
| 2291 | } | ||
| 2292 | |||
| 2293 | static inline void ept_sync_context(u64 eptp) | ||
| 2294 | { | ||
| 2295 | if (cpu_has_vmx_invept_context()) | ||
| 2296 | __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); | ||
| 2297 | else | ||
| 2298 | ept_sync_global(); | ||
| 2299 | } | ||
| 2300 | |||
| 2301 | static __always_inline void vmcs_check16(unsigned long field) | ||
| 2302 | { | ||
| 2303 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, | ||
| 2304 | "16-bit accessor invalid for 64-bit field"); | ||
| 2305 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, | ||
| 2306 | "16-bit accessor invalid for 64-bit high field"); | ||
| 2307 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, | ||
| 2308 | "16-bit accessor invalid for 32-bit field"); | ||
| 2309 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, | ||
| 2310 | "16-bit accessor invalid for natural width field"); | ||
| 2311 | } | ||
| 2312 | |||
| 2313 | static __always_inline void vmcs_check32(unsigned long field) | ||
| 2314 | { | ||
| 2315 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, | ||
| 2316 | "32-bit accessor invalid for 16-bit field"); | ||
| 2317 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, | ||
| 2318 | "32-bit accessor invalid for natural width field"); | ||
| 2319 | } | ||
| 2320 | |||
| 2321 | static __always_inline void vmcs_check64(unsigned long field) | ||
| 2322 | { | ||
| 2323 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, | ||
| 2324 | "64-bit accessor invalid for 16-bit field"); | ||
| 2325 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, | ||
| 2326 | "64-bit accessor invalid for 64-bit high field"); | ||
| 2327 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, | ||
| 2328 | "64-bit accessor invalid for 32-bit field"); | ||
| 2329 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, | ||
| 2330 | "64-bit accessor invalid for natural width field"); | ||
| 2331 | } | ||
| 2332 | |||
| 2333 | static __always_inline void vmcs_checkl(unsigned long field) | ||
| 2334 | { | ||
| 2335 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, | ||
| 2336 | "Natural width accessor invalid for 16-bit field"); | ||
| 2337 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, | ||
| 2338 | "Natural width accessor invalid for 64-bit field"); | ||
| 2339 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, | ||
| 2340 | "Natural width accessor invalid for 64-bit high field"); | ||
| 2341 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, | ||
| 2342 | "Natural width accessor invalid for 32-bit field"); | ||
| 2343 | } | ||
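The masks in the checks above come straight from the VMCS field-encoding layout in the Intel SDM: bits 14:13 of a field encoding give its width (0 = 16-bit, 1 = 64-bit, 2 = 32-bit, 3 = natural width) and bit 0 selects the high half of a 64-bit field, so 0x6000 isolates the width and 0x6001 additionally distinguishes full and high-half accesses. A stand-alone sketch of that classification, using illustrative helper names and a few well-known encodings:

#include <stdio.h>

/* Width values held in bits 14:13 of a VMCS field encoding. */
enum vmcs_width { VMCS_W16 = 0, VMCS_W64 = 1, VMCS_W32 = 2, VMCS_WNAT = 3 };

static enum vmcs_width vmcs_field_width(unsigned long field)
{
	return (field >> 13) & 0x3;	/* the bits selected by the 0x6000 mask */
}

static int vmcs_field_is_high(unsigned long field)
{
	return field & 0x1;		/* bit 0: high half of a 64-bit field */
}

int main(void)
{
	unsigned long guest_es_selector = 0x0800;	/* 16-bit field */
	unsigned long io_bitmap_a       = 0x2000;	/* 64-bit field */
	unsigned long io_bitmap_a_high  = 0x2001;	/* 64-bit field, high half */
	unsigned long guest_cr0         = 0x6800;	/* natural-width field */

	printf("%d %d %d %d\n",
	       vmcs_field_width(guest_es_selector),	/* 0 */
	       vmcs_field_width(io_bitmap_a),		/* 1 */
	       vmcs_field_width(guest_cr0),		/* 3 */
	       vmcs_field_is_high(io_bitmap_a_high));	/* 1 */
	return 0;
}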
| 2344 | |||
| 2345 | static __always_inline unsigned long __vmcs_readl(unsigned long field) | ||
| 2346 | { | ||
| 2347 | unsigned long value; | ||
| 2348 | |||
| 2349 | asm volatile (__ex_clear("vmread %1, %0", "%k0") | ||
| 2350 | : "=r"(value) : "r"(field)); | ||
| 2351 | return value; | ||
| 2352 | } | ||
| 2353 | |||
| 2354 | static __always_inline u16 vmcs_read16(unsigned long field) | ||
| 2355 | { | ||
| 2356 | vmcs_check16(field); | ||
| 2357 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 2358 | return evmcs_read16(field); | ||
| 2359 | return __vmcs_readl(field); | ||
| 2360 | } | ||
| 2361 | |||
| 2362 | static __always_inline u32 vmcs_read32(unsigned long field) | ||
| 2363 | { | ||
| 2364 | vmcs_check32(field); | ||
| 2365 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 2366 | return evmcs_read32(field); | ||
| 2367 | return __vmcs_readl(field); | ||
| 2368 | } | ||
| 2369 | |||
| 2370 | static __always_inline u64 vmcs_read64(unsigned long field) | ||
| 2371 | { | ||
| 2372 | vmcs_check64(field); | ||
| 2373 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 2374 | return evmcs_read64(field); | ||
| 2375 | #ifdef CONFIG_X86_64 | ||
| 2376 | return __vmcs_readl(field); | ||
| 2377 | #else | ||
| 2378 | return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32); | ||
| 2379 | #endif | ||
| 2380 | } | ||
| 2381 | |||
| 2382 | static __always_inline unsigned long vmcs_readl(unsigned long field) | ||
| 2383 | { | ||
| 2384 | vmcs_checkl(field); | ||
| 2385 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 2386 | return evmcs_read64(field); | ||
| 2387 | return __vmcs_readl(field); | ||
| 2388 | } | ||
| 2389 | |||
| 2390 | static noinline void vmwrite_error(unsigned long field, unsigned long value) | ||
| 2391 | { | ||
| 2392 | printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", | ||
| 2393 | field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); | ||
| 2394 | dump_stack(); | ||
| 2395 | } | ||
| 2396 | |||
| 2397 | static __always_inline void __vmcs_writel(unsigned long field, unsigned long value) | ||
| 2398 | { | ||
| 2399 | bool error; | ||
| 2400 | |||
| 2401 | asm volatile (__ex("vmwrite %2, %1") CC_SET(na) | ||
| 2402 | : CC_OUT(na) (error) : "r"(field), "rm"(value)); | ||
| 2403 | if (unlikely(error)) | ||
| 2404 | vmwrite_error(field, value); | ||
| 2405 | } | ||
| 2406 | |||
| 2407 | static __always_inline void vmcs_write16(unsigned long field, u16 value) | ||
| 2408 | { | ||
| 2409 | vmcs_check16(field); | ||
| 2410 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 2411 | return evmcs_write16(field, value); | ||
| 2412 | |||
| 2413 | __vmcs_writel(field, value); | ||
| 2414 | } | ||
| 2415 | |||
| 2416 | static __always_inline void vmcs_write32(unsigned long field, u32 value) | ||
| 2417 | { | ||
| 2418 | vmcs_check32(field); | ||
| 2419 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 2420 | return evmcs_write32(field, value); | ||
| 2421 | |||
| 2422 | __vmcs_writel(field, value); | ||
| 2423 | } | ||
| 2424 | |||
| 2425 | static __always_inline void vmcs_write64(unsigned long field, u64 value) | ||
| 2426 | { | ||
| 2427 | vmcs_check64(field); | ||
| 2428 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 2429 | return evmcs_write64(field, value); | ||
| 2430 | |||
| 2431 | __vmcs_writel(field, value); | ||
| 2432 | #ifndef CONFIG_X86_64 | ||
| 2433 | asm volatile (""); | ||
| 2434 | __vmcs_writel(field+1, value >> 32); | ||
| 2435 | #endif | ||
| 2436 | } | ||
| 2437 | |||
| 2438 | static __always_inline void vmcs_writel(unsigned long field, unsigned long value) | ||
| 2439 | { | ||
| 2440 | vmcs_checkl(field); | ||
| 2441 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 2442 | return evmcs_write64(field, value); | ||
| 2443 | |||
| 2444 | __vmcs_writel(field, value); | ||
| 2445 | } | ||
| 2446 | |||
| 2447 | static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask) | ||
| 2448 | { | ||
| 2449 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, | ||
| 2450 | "vmcs_clear_bits does not support 64-bit fields"); | ||
| 2451 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 2452 | return evmcs_write32(field, evmcs_read32(field) & ~mask); | ||
| 2453 | |||
| 2454 | __vmcs_writel(field, __vmcs_readl(field) & ~mask); | ||
| 2455 | } | ||
| 2456 | |||
| 2457 | static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) | ||
| 2458 | { | ||
| 2459 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, | ||
| 2460 | "vmcs_set_bits does not support 64-bit fields"); | ||
| 2461 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 2462 | return evmcs_write32(field, evmcs_read32(field) | mask); | ||
| 2463 | |||
| 2464 | __vmcs_writel(field, __vmcs_readl(field) | mask); | ||
| 2465 | } | ||
| 2466 | |||
| 2467 | static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx) | ||
| 2468 | { | ||
| 2469 | vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS); | ||
| 2470 | } | ||
| 2471 | |||
| 2472 | static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val) | ||
| 2473 | { | ||
| 2474 | vmcs_write32(VM_ENTRY_CONTROLS, val); | ||
| 2475 | vmx->vm_entry_controls_shadow = val; | ||
| 2476 | } | ||
| 2477 | |||
| 2478 | static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val) | ||
| 2479 | { | ||
| 2480 | if (vmx->vm_entry_controls_shadow != val) | ||
| 2481 | vm_entry_controls_init(vmx, val); | ||
| 2482 | } | ||
| 2483 | |||
| 2484 | static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx) | ||
| 2485 | { | ||
| 2486 | return vmx->vm_entry_controls_shadow; | ||
| 2487 | } | ||
| 2488 | |||
| 2489 | |||
| 2490 | static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val) | ||
| 2491 | { | ||
| 2492 | vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val); | ||
| 2493 | } | ||
| 2494 | |||
| 2495 | static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val) | ||
| 2496 | { | ||
| 2497 | vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val); | ||
| 2498 | } | ||
| 2499 | |||
| 2500 | static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx) | ||
| 2501 | { | ||
| 2502 | vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS); | ||
| 2503 | } | ||
| 2504 | |||
| 2505 | static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val) | ||
| 2506 | { | ||
| 2507 | vmcs_write32(VM_EXIT_CONTROLS, val); | ||
| 2508 | vmx->vm_exit_controls_shadow = val; | ||
| 2509 | } | ||
| 2510 | |||
| 2511 | static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val) | ||
| 2512 | { | ||
| 2513 | if (vmx->vm_exit_controls_shadow != val) | ||
| 2514 | vm_exit_controls_init(vmx, val); | ||
| 2515 | } | ||
| 2516 | |||
| 2517 | static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx) | ||
| 2518 | { | ||
| 2519 | return vmx->vm_exit_controls_shadow; | ||
| 2520 | } | ||
| 2521 | |||
| 2522 | |||
| 2523 | static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val) | ||
| 2524 | { | ||
| 2525 | vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val); | ||
| 2526 | } | ||
| 2527 | |||
| 2528 | static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val) | ||
| 2529 | { | ||
| 2530 | vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val); | ||
| 2531 | } | ||
| 2532 | |||
| 2533 | static void vmx_segment_cache_clear(struct vcpu_vmx *vmx) | ||
| 2534 | { | ||
| 2535 | vmx->segment_cache.bitmask = 0; | ||
| 2536 | } | ||
| 2537 | |||
| 2538 | static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, | ||
| 2539 | unsigned field) | ||
| 2540 | { | ||
| 2541 | bool ret; | ||
| 2542 | u32 mask = 1 << (seg * SEG_FIELD_NR + field); | ||
| 2543 | |||
| 2544 | if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) { | ||
| 2545 | vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS); | ||
| 2546 | vmx->segment_cache.bitmask = 0; | ||
| 2547 | } | ||
| 2548 | ret = vmx->segment_cache.bitmask & mask; | ||
| 2549 | vmx->segment_cache.bitmask |= mask; | ||
| 2550 | return ret; | ||
| 2551 | } | ||
| 2552 | |||
| 2553 | static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) | ||
| 2554 | { | ||
| 2555 | u16 *p = &vmx->segment_cache.seg[seg].selector; | ||
| 2556 | |||
| 2557 | if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) | ||
| 2558 | *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); | ||
| 2559 | return *p; | ||
| 2560 | } | ||
| 2561 | |||
| 2562 | static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) | ||
| 2563 | { | ||
| 2564 | ulong *p = &vmx->segment_cache.seg[seg].base; | ||
| 2565 | |||
| 2566 | if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) | ||
| 2567 | *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); | ||
| 2568 | return *p; | ||
| 2569 | } | ||
| 2570 | |||
| 2571 | static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) | ||
| 2572 | { | ||
| 2573 | u32 *p = &vmx->segment_cache.seg[seg].limit; | ||
| 2574 | |||
| 2575 | if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) | ||
| 2576 | *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); | ||
| 2577 | return *p; | ||
| 2578 | } | ||
| 2579 | |||
| 2580 | static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) | ||
| 2581 | { | ||
| 2582 | u32 *p = &vmx->segment_cache.seg[seg].ar; | ||
| 2583 | |||
| 2584 | if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) | ||
| 2585 | *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); | ||
| 2586 | return *p; | ||
| 2587 | } | ||
| 2588 | |||
| 2589 | static void update_exception_bitmap(struct kvm_vcpu *vcpu) | ||
| 2590 | { | ||
| 2591 | u32 eb; | ||
| 2592 | |||
| 2593 | eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | | ||
| 2594 | (1u << DB_VECTOR) | (1u << AC_VECTOR); | ||
| 2595 | /* | ||
| 2596 | * Guest access to VMware backdoor ports could legitimately | ||
| 2597 | * trigger #GP because of the TSS I/O permission bitmap. | ||
| 2598 | * We intercept those #GPs and allow access to the ports | ||
| 2599 | * anyway, as VMware does. | ||
| 2600 | */ | ||
| 2601 | if (enable_vmware_backdoor) | ||
| 2602 | eb |= (1u << GP_VECTOR); | ||
| 2603 | if ((vcpu->guest_debug & | ||
| 2604 | (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == | ||
| 2605 | (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) | ||
| 2606 | eb |= 1u << BP_VECTOR; | ||
| 2607 | if (to_vmx(vcpu)->rmode.vm86_active) | ||
| 2608 | eb = ~0; | ||
| 2609 | if (enable_ept) | ||
| 2610 | eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ | ||
| 2611 | |||
| 2612 | /* When we are running a nested L2 guest and L1 specified an exception | ||
| 2613 | * bitmap for it, we must trap the same exceptions and pass them to L1. | ||
| 2614 | * When running L2, we only handle the exceptions specified above if L1 | ||
| 2615 | * did not want them. | ||
| 2616 | */ | ||
| 2617 | if (is_guest_mode(vcpu)) | ||
| 2618 | eb |= get_vmcs12(vcpu)->exception_bitmap; | ||
| 2619 | |||
| 2620 | vmcs_write32(EXCEPTION_BITMAP, eb); | ||
| 2621 | } | ||
| 2622 | |||
| 2623 | /* | ||
| 2624 | * Check if a write to the MSR is intercepted in the currently loaded MSR bitmap. | ||
| 2625 | */ | ||
| 2626 | static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) | ||
| 2627 | { | ||
| 2628 | unsigned long *msr_bitmap; | ||
| 2629 | int f = sizeof(unsigned long); | ||
| 2630 | |||
| 2631 | if (!cpu_has_vmx_msr_bitmap()) | ||
| 2632 | return true; | ||
| 2633 | |||
| 2634 | msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap; | ||
| 2635 | |||
| 2636 | if (msr <= 0x1fff) { | ||
| 2637 | return !!test_bit(msr, msr_bitmap + 0x800 / f); | ||
| 2638 | } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { | ||
| 2639 | msr &= 0x1fff; | ||
| 2640 | return !!test_bit(msr, msr_bitmap + 0xc00 / f); | ||
| 2641 | } | ||
| 2642 | |||
| 2643 | return true; | ||
| 2644 | } | ||
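For reference, the 4 KiB MSR bitmap this helper walks is split into four 1024-bit regions: read-low at byte offset 0x000, read-high at 0x400, write-low at 0x800 and write-high at 0xc00, covering MSRs 0x0-0x1fff and 0xc0000000-0xc0001fff respectively; dividing the byte offset by sizeof(unsigned long) just turns it into the word index that test_bit() expects. A hedged, stand-alone sketch of the same offset arithmetic (the helper name is illustrative):

#include <stdint.h>
#include <stdio.h>

/*
 * Bit offset (within the 4 KiB bitmap page) of the write-intercept bit
 * for an MSR, or -1 if the MSR lies outside the two covered ranges and
 * is therefore always intercepted.
 */
static long msr_write_bit_offset(uint32_t msr)
{
	if (msr <= 0x1fff)
		return 0x800 * 8 + msr;			/* write-low region */
	if (msr >= 0xc0000000 && msr <= 0xc0001fff)
		return 0xc00 * 8 + (msr & 0x1fff);	/* write-high region */
	return -1;
}

int main(void)
{
	printf("%ld\n", msr_write_bit_offset(0x175));		/* IA32_SYSENTER_ESP */
	printf("%ld\n", msr_write_bit_offset(0xc0000080));	/* IA32_EFER */
	printf("%ld\n", msr_write_bit_offset(0x40000000));	/* uncovered: -1 */
	return 0;
}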
| 2645 | |||
| 2646 | /* | ||
| 2647 | * Check if a write to the MSR is intercepted in the L01 MSR bitmap. | ||
| 2648 | */ | ||
| 2649 | static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr) | ||
| 2650 | { | ||
| 2651 | unsigned long *msr_bitmap; | ||
| 2652 | int f = sizeof(unsigned long); | ||
| 2653 | |||
| 2654 | if (!cpu_has_vmx_msr_bitmap()) | ||
| 2655 | return true; | ||
| 2656 | |||
| 2657 | msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; | ||
| 2658 | |||
| 2659 | if (msr <= 0x1fff) { | ||
| 2660 | return !!test_bit(msr, msr_bitmap + 0x800 / f); | ||
| 2661 | } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { | ||
| 2662 | msr &= 0x1fff; | ||
| 2663 | return !!test_bit(msr, msr_bitmap + 0xc00 / f); | ||
| 2664 | } | ||
| 2665 | |||
| 2666 | return true; | ||
| 2667 | } | ||
| 2668 | |||
| 2669 | static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, | ||
| 2670 | unsigned long entry, unsigned long exit) | ||
| 2671 | { | ||
| 2672 | vm_entry_controls_clearbit(vmx, entry); | ||
| 2673 | vm_exit_controls_clearbit(vmx, exit); | ||
| 2674 | } | ||
| 2675 | |||
| 2676 | static int find_msr(struct vmx_msrs *m, unsigned int msr) | ||
| 2677 | { | ||
| 2678 | unsigned int i; | ||
| 2679 | |||
| 2680 | for (i = 0; i < m->nr; ++i) { | ||
| 2681 | if (m->val[i].index == msr) | ||
| 2682 | return i; | ||
| 2683 | } | ||
| 2684 | return -ENOENT; | ||
| 2685 | } | ||
| 2686 | |||
| 2687 | static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) | ||
| 2688 | { | ||
| 2689 | int i; | ||
| 2690 | struct msr_autoload *m = &vmx->msr_autoload; | ||
| 2691 | |||
| 2692 | switch (msr) { | ||
| 2693 | case MSR_EFER: | ||
| 2694 | if (cpu_has_load_ia32_efer) { | ||
| 2695 | clear_atomic_switch_msr_special(vmx, | ||
| 2696 | VM_ENTRY_LOAD_IA32_EFER, | ||
| 2697 | VM_EXIT_LOAD_IA32_EFER); | ||
| 2698 | return; | ||
| 2699 | } | ||
| 2700 | break; | ||
| 2701 | case MSR_CORE_PERF_GLOBAL_CTRL: | ||
| 2702 | if (cpu_has_load_perf_global_ctrl) { | ||
| 2703 | clear_atomic_switch_msr_special(vmx, | ||
| 2704 | VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, | ||
| 2705 | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); | ||
| 2706 | return; | ||
| 2707 | } | ||
| 2708 | break; | ||
| 2709 | } | ||
| 2710 | i = find_msr(&m->guest, msr); | ||
| 2711 | if (i < 0) | ||
| 2712 | goto skip_guest; | ||
| 2713 | --m->guest.nr; | ||
| 2714 | m->guest.val[i] = m->guest.val[m->guest.nr]; | ||
| 2715 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); | ||
| 2716 | |||
| 2717 | skip_guest: | ||
| 2718 | i = find_msr(&m->host, msr); | ||
| 2719 | if (i < 0) | ||
| 2720 | return; | ||
| 2721 | |||
| 2722 | --m->host.nr; | ||
| 2723 | m->host.val[i] = m->host.val[m->host.nr]; | ||
| 2724 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); | ||
| 2725 | } | ||
| 2726 | |||
| 2727 | static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, | ||
| 2728 | unsigned long entry, unsigned long exit, | ||
| 2729 | unsigned long guest_val_vmcs, unsigned long host_val_vmcs, | ||
| 2730 | u64 guest_val, u64 host_val) | ||
| 2731 | { | ||
| 2732 | vmcs_write64(guest_val_vmcs, guest_val); | ||
| 2733 | if (host_val_vmcs != HOST_IA32_EFER) | ||
| 2734 | vmcs_write64(host_val_vmcs, host_val); | ||
| 2735 | vm_entry_controls_setbit(vmx, entry); | ||
| 2736 | vm_exit_controls_setbit(vmx, exit); | ||
| 2737 | } | ||
| 2738 | |||
| 2739 | static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, | ||
| 2740 | u64 guest_val, u64 host_val, bool entry_only) | ||
| 2741 | { | ||
| 2742 | int i, j = 0; | ||
| 2743 | struct msr_autoload *m = &vmx->msr_autoload; | ||
| 2744 | |||
| 2745 | switch (msr) { | ||
| 2746 | case MSR_EFER: | ||
| 2747 | if (cpu_has_load_ia32_efer) { | ||
| 2748 | add_atomic_switch_msr_special(vmx, | ||
| 2749 | VM_ENTRY_LOAD_IA32_EFER, | ||
| 2750 | VM_EXIT_LOAD_IA32_EFER, | ||
| 2751 | GUEST_IA32_EFER, | ||
| 2752 | HOST_IA32_EFER, | ||
| 2753 | guest_val, host_val); | ||
| 2754 | return; | ||
| 2755 | } | ||
| 2756 | break; | ||
| 2757 | case MSR_CORE_PERF_GLOBAL_CTRL: | ||
| 2758 | if (cpu_has_load_perf_global_ctrl) { | ||
| 2759 | add_atomic_switch_msr_special(vmx, | ||
| 2760 | VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, | ||
| 2761 | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, | ||
| 2762 | GUEST_IA32_PERF_GLOBAL_CTRL, | ||
| 2763 | HOST_IA32_PERF_GLOBAL_CTRL, | ||
| 2764 | guest_val, host_val); | ||
| 2765 | return; | ||
| 2766 | } | ||
| 2767 | break; | ||
| 2768 | case MSR_IA32_PEBS_ENABLE: | ||
| 2769 | /* PEBS needs a quiescent period after being disabled (to write | ||
| 2770 | * a record). Disabling PEBS through VMX MSR swapping doesn't | ||
| 2771 | * provide that period, so a CPU could write the host's record into | ||
| 2772 | * the guest's memory. | ||
| 2773 | */ | ||
| 2774 | wrmsrl(MSR_IA32_PEBS_ENABLE, 0); | ||
| 2775 | } | ||
| 2776 | |||
| 2777 | i = find_msr(&m->guest, msr); | ||
| 2778 | if (!entry_only) | ||
| 2779 | j = find_msr(&m->host, msr); | ||
| 2780 | |||
| 2781 | if (i == NR_AUTOLOAD_MSRS || j == NR_AUTOLOAD_MSRS) { | ||
| 2782 | printk_once(KERN_WARNING "Not enough msr switch entries. " | ||
| 2783 | "Can't add msr %x\n", msr); | ||
| 2784 | return; | ||
| 2785 | } | ||
| 2786 | if (i < 0) { | ||
| 2787 | i = m->guest.nr++; | ||
| 2788 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); | ||
| 2789 | } | ||
| 2790 | m->guest.val[i].index = msr; | ||
| 2791 | m->guest.val[i].value = guest_val; | ||
| 2792 | |||
| 2793 | if (entry_only) | ||
| 2794 | return; | ||
| 2795 | |||
| 2796 | if (j < 0) { | ||
| 2797 | j = m->host.nr++; | ||
| 2798 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); | ||
| 2799 | } | ||
| 2800 | m->host.val[j].index = msr; | ||
| 2801 | m->host.val[j].value = host_val; | ||
| 2802 | } | ||
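Both clear_atomic_switch_msr() and add_atomic_switch_msr() keep the autoload arrays dense: removal copies the last entry into the vacated slot and shrinks the count, so the hardware always walks a packed list whose length is VM_ENTRY_MSR_LOAD_COUNT / VM_EXIT_MSR_LOAD_COUNT. A small stand-alone sketch of that swap-with-last removal (struct and helper names are illustrative, mirroring the architectural {index, reserved, value} entry format):

#include <stdint.h>
#include <stdio.h>

#define NR_AUTOLOAD_MSRS 8

struct msr_entry { uint32_t index; uint32_t reserved; uint64_t value; };
struct msr_list  { unsigned int nr; struct msr_entry val[NR_AUTOLOAD_MSRS]; };

/* Remove 'msr' while keeping the array packed: move the last entry into
 * the vacated slot and decrement the count (ordering is not preserved). */
static void msr_list_del(struct msr_list *m, uint32_t msr)
{
	unsigned int i;

	for (i = 0; i < m->nr; i++) {
		if (m->val[i].index == msr) {
			m->val[i] = m->val[--m->nr];
			return;
		}
	}
}

int main(void)
{
	struct msr_list m = {
		.nr = 3,
		.val = { { .index = 0x175 }, { .index = 0xc0000080 }, { .index = 0x277 } },
	};

	msr_list_del(&m, 0x175);
	printf("%u %#x\n", m.nr, m.val[0].index);	/* 2 0x277: last entry moved up */
	return 0;
}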
| 2803 | |||
| 2804 | static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) | ||
| 2805 | { | ||
| 2806 | u64 guest_efer = vmx->vcpu.arch.efer; | ||
| 2807 | u64 ignore_bits = 0; | ||
| 2808 | |||
| 2809 | if (!enable_ept) { | ||
| 2810 | /* | ||
| 2811 | * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing | ||
| 2812 | * host CPUID is more efficient than testing guest CPUID | ||
| 2813 | * or CR4. Host SMEP is anyway a requirement for guest SMEP. | ||
| 2814 | */ | ||
| 2815 | if (boot_cpu_has(X86_FEATURE_SMEP)) | ||
| 2816 | guest_efer |= EFER_NX; | ||
| 2817 | else if (!(guest_efer & EFER_NX)) | ||
| 2818 | ignore_bits |= EFER_NX; | ||
| 2819 | } | ||
| 2820 | |||
| 2821 | /* | ||
| 2822 | * LMA and LME handled by hardware; SCE meaningless outside long mode. | ||
| 2823 | */ | ||
| 2824 | ignore_bits |= EFER_SCE; | ||
| 2825 | #ifdef CONFIG_X86_64 | ||
| 2826 | ignore_bits |= EFER_LMA | EFER_LME; | ||
| 2827 | /* SCE is meaningful only in long mode on Intel */ | ||
| 2828 | if (guest_efer & EFER_LMA) | ||
| 2829 | ignore_bits &= ~(u64)EFER_SCE; | ||
| 2830 | #endif | ||
| 2831 | |||
| 2832 | /* | ||
| 2833 | * On EPT, we can't emulate NX, so we must switch EFER atomically. | ||
| 2834 | * On CPUs that support "load IA32_EFER", always switch EFER | ||
| 2835 | * atomically, since it's faster than switching it manually. | ||
| 2836 | */ | ||
| 2837 | if (cpu_has_load_ia32_efer || | ||
| 2838 | (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) { | ||
| 2839 | if (!(guest_efer & EFER_LMA)) | ||
| 2840 | guest_efer &= ~EFER_LME; | ||
| 2841 | if (guest_efer != host_efer) | ||
| 2842 | add_atomic_switch_msr(vmx, MSR_EFER, | ||
| 2843 | guest_efer, host_efer, false); | ||
| 2844 | else | ||
| 2845 | clear_atomic_switch_msr(vmx, MSR_EFER); | ||
| 2846 | return false; | ||
| 2847 | } else { | ||
| 2848 | clear_atomic_switch_msr(vmx, MSR_EFER); | ||
| 2849 | |||
| 2850 | guest_efer &= ~ignore_bits; | ||
| 2851 | guest_efer |= host_efer & ignore_bits; | ||
| 2852 | |||
| 2853 | vmx->guest_msrs[efer_offset].data = guest_efer; | ||
| 2854 | vmx->guest_msrs[efer_offset].mask = ~ignore_bits; | ||
| 2855 | |||
| 2856 | return true; | ||
| 2857 | } | ||
| 2858 | } | ||
| 2859 | |||
| 2860 | #ifdef CONFIG_X86_32 | ||
| 2861 | /* | ||
| 2862 | * On 32-bit kernels, VM exits still load the FS and GS bases from the | ||
| 2863 | * VMCS rather than the segment table. KVM uses this helper to figure | ||
| 2864 | * out the current bases to poke them into the VMCS before entry. | ||
| 2865 | */ | ||
| 2866 | static unsigned long segment_base(u16 selector) | ||
| 2867 | { | ||
| 2868 | struct desc_struct *table; | ||
| 2869 | unsigned long v; | ||
| 2870 | |||
| 2871 | if (!(selector & ~SEGMENT_RPL_MASK)) | ||
| 2872 | return 0; | ||
| 2873 | |||
| 2874 | table = get_current_gdt_ro(); | ||
| 2875 | |||
| 2876 | if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { | ||
| 2877 | u16 ldt_selector = kvm_read_ldt(); | ||
| 2878 | |||
| 2879 | if (!(ldt_selector & ~SEGMENT_RPL_MASK)) | ||
| 2880 | return 0; | ||
| 2881 | |||
| 2882 | table = (struct desc_struct *)segment_base(ldt_selector); | ||
| 2883 | } | ||
| 2884 | v = get_desc_base(&table[selector >> 3]); | ||
| 2885 | return v; | ||
| 2886 | } | ||
| 2887 | #endif | ||
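get_desc_base() has to reassemble the base because a legacy GDT/LDT descriptor scatters it across three fields: bits 15:0 in bytes 2-3, bits 23:16 in byte 4 and bits 31:24 in byte 7 (the selector's index bits, selector >> 3, pick the 8-byte entry). A minimal sketch of that reassembly from a raw descriptor, using a hypothetical helper rather than the kernel's desc_struct:

#include <stdint.h>
#include <stdio.h>

/* Rebuild the 32-bit base address from the raw bytes of a legacy descriptor. */
static uint32_t desc_base(const uint8_t d[8])
{
	return (uint32_t)d[2] | ((uint32_t)d[3] << 8) |
	       ((uint32_t)d[4] << 16) | ((uint32_t)d[7] << 24);
}

int main(void)
{
	/* A descriptor whose base is 0x12345678. */
	uint8_t d[8] = { 0xff, 0xff, 0x78, 0x56, 0x34, 0x9a, 0xcf, 0x12 };

	printf("%#x\n", desc_base(d));	/* prints 0x12345678 */
	return 0;
}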
| 2888 | |||
| 2889 | static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) | ||
| 2890 | { | ||
| 2891 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 2892 | struct vmcs_host_state *host_state; | ||
| 2893 | #ifdef CONFIG_X86_64 | ||
| 2894 | int cpu = raw_smp_processor_id(); | ||
| 2895 | #endif | ||
| 2896 | unsigned long fs_base, gs_base; | ||
| 2897 | u16 fs_sel, gs_sel; | ||
| 2898 | int i; | ||
| 2899 | |||
| 2900 | vmx->req_immediate_exit = false; | ||
| 2901 | |||
| 2902 | /* | ||
| 2903 | * Note that guest MSRs to be saved/restored can also be changed | ||
| 2904 | * when guest state is loaded. This happens when guest transitions | ||
| 2905 | * to/from long-mode by setting MSR_EFER.LMA. | ||
| 2906 | */ | ||
| 2907 | if (!vmx->loaded_cpu_state || vmx->guest_msrs_dirty) { | ||
| 2908 | vmx->guest_msrs_dirty = false; | ||
| 2909 | for (i = 0; i < vmx->save_nmsrs; ++i) | ||
| 2910 | kvm_set_shared_msr(vmx->guest_msrs[i].index, | ||
| 2911 | vmx->guest_msrs[i].data, | ||
| 2912 | vmx->guest_msrs[i].mask); | ||
| 2913 | |||
| 2914 | } | ||
| 2915 | |||
| 2916 | if (vmx->loaded_cpu_state) | ||
| 2917 | return; | ||
| 2918 | |||
| 2919 | vmx->loaded_cpu_state = vmx->loaded_vmcs; | ||
| 2920 | host_state = &vmx->loaded_cpu_state->host_state; | ||
| 2921 | |||
| 2922 | /* | ||
| 2923 | * Set host fs and gs selectors. Unfortunately, 22.2.3 does not | ||
| 2924 | * allow segment selectors with cpl > 0 or ti == 1. | ||
| 2925 | */ | ||
| 2926 | host_state->ldt_sel = kvm_read_ldt(); | ||
| 2927 | |||
| 2928 | #ifdef CONFIG_X86_64 | ||
| 2929 | savesegment(ds, host_state->ds_sel); | ||
| 2930 | savesegment(es, host_state->es_sel); | ||
| 2931 | |||
| 2932 | gs_base = cpu_kernelmode_gs_base(cpu); | ||
| 2933 | if (likely(is_64bit_mm(current->mm))) { | ||
| 2934 | save_fsgs_for_kvm(); | ||
| 2935 | fs_sel = current->thread.fsindex; | ||
| 2936 | gs_sel = current->thread.gsindex; | ||
| 2937 | fs_base = current->thread.fsbase; | ||
| 2938 | vmx->msr_host_kernel_gs_base = current->thread.gsbase; | ||
| 2939 | } else { | ||
| 2940 | savesegment(fs, fs_sel); | ||
| 2941 | savesegment(gs, gs_sel); | ||
| 2942 | fs_base = read_msr(MSR_FS_BASE); | ||
| 2943 | vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); | ||
| 2944 | } | ||
| 2945 | |||
| 2946 | wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); | ||
| 2947 | #else | ||
| 2948 | savesegment(fs, fs_sel); | ||
| 2949 | savesegment(gs, gs_sel); | ||
| 2950 | fs_base = segment_base(fs_sel); | ||
| 2951 | gs_base = segment_base(gs_sel); | ||
| 2952 | #endif | ||
| 2953 | |||
| 2954 | if (unlikely(fs_sel != host_state->fs_sel)) { | ||
| 2955 | if (!(fs_sel & 7)) | ||
| 2956 | vmcs_write16(HOST_FS_SELECTOR, fs_sel); | ||
| 2957 | else | ||
| 2958 | vmcs_write16(HOST_FS_SELECTOR, 0); | ||
| 2959 | host_state->fs_sel = fs_sel; | ||
| 2960 | } | ||
| 2961 | if (unlikely(gs_sel != host_state->gs_sel)) { | ||
| 2962 | if (!(gs_sel & 7)) | ||
| 2963 | vmcs_write16(HOST_GS_SELECTOR, gs_sel); | ||
| 2964 | else | ||
| 2965 | vmcs_write16(HOST_GS_SELECTOR, 0); | ||
| 2966 | host_state->gs_sel = gs_sel; | ||
| 2967 | } | ||
| 2968 | if (unlikely(fs_base != host_state->fs_base)) { | ||
| 2969 | vmcs_writel(HOST_FS_BASE, fs_base); | ||
| 2970 | host_state->fs_base = fs_base; | ||
| 2971 | } | ||
| 2972 | if (unlikely(gs_base != host_state->gs_base)) { | ||
| 2973 | vmcs_writel(HOST_GS_BASE, gs_base); | ||
| 2974 | host_state->gs_base = gs_base; | ||
| 2975 | } | ||
| 2976 | } | ||
| 2977 | |||
| 2978 | static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) | ||
| 2979 | { | ||
| 2980 | struct vmcs_host_state *host_state; | ||
| 2981 | |||
| 2982 | if (!vmx->loaded_cpu_state) | ||
| 2983 | return; | ||
| 2984 | |||
| 2985 | WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs); | ||
| 2986 | host_state = &vmx->loaded_cpu_state->host_state; | ||
| 2987 | |||
| 2988 | ++vmx->vcpu.stat.host_state_reload; | ||
| 2989 | vmx->loaded_cpu_state = NULL; | ||
| 2990 | |||
| 2991 | #ifdef CONFIG_X86_64 | ||
| 2992 | rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); | ||
| 2993 | #endif | ||
| 2994 | if (host_state->ldt_sel || (host_state->gs_sel & 7)) { | ||
| 2995 | kvm_load_ldt(host_state->ldt_sel); | ||
| 2996 | #ifdef CONFIG_X86_64 | ||
| 2997 | load_gs_index(host_state->gs_sel); | ||
| 2998 | #else | ||
| 2999 | loadsegment(gs, host_state->gs_sel); | ||
| 3000 | #endif | ||
| 3001 | } | ||
| 3002 | if (host_state->fs_sel & 7) | ||
| 3003 | loadsegment(fs, host_state->fs_sel); | ||
| 3004 | #ifdef CONFIG_X86_64 | ||
| 3005 | if (unlikely(host_state->ds_sel | host_state->es_sel)) { | ||
| 3006 | loadsegment(ds, host_state->ds_sel); | ||
| 3007 | loadsegment(es, host_state->es_sel); | ||
| 3008 | } | ||
| 3009 | #endif | ||
| 3010 | invalidate_tss_limit(); | ||
| 3011 | #ifdef CONFIG_X86_64 | ||
| 3012 | wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); | ||
| 3013 | #endif | ||
| 3014 | load_fixmap_gdt(raw_smp_processor_id()); | ||
| 3015 | } | ||
| 3016 | |||
| 3017 | #ifdef CONFIG_X86_64 | ||
| 3018 | static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) | ||
| 3019 | { | ||
| 3020 | preempt_disable(); | ||
| 3021 | if (vmx->loaded_cpu_state) | ||
| 3022 | rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); | ||
| 3023 | preempt_enable(); | ||
| 3024 | return vmx->msr_guest_kernel_gs_base; | ||
| 3025 | } | ||
| 3026 | |||
| 3027 | static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) | ||
| 3028 | { | ||
| 3029 | preempt_disable(); | ||
| 3030 | if (vmx->loaded_cpu_state) | ||
| 3031 | wrmsrl(MSR_KERNEL_GS_BASE, data); | ||
| 3032 | preempt_enable(); | ||
| 3033 | vmx->msr_guest_kernel_gs_base = data; | ||
| 3034 | } | ||
| 3035 | #endif | ||
| 3036 | |||
| 3037 | static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) | ||
| 3038 | { | ||
| 3039 | struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); | ||
| 3040 | struct pi_desc old, new; | ||
| 3041 | unsigned int dest; | ||
| 3042 | |||
| 3043 | /* | ||
| 3044 | * In case of hot-plug or hot-unplug, we may have to undo | ||
| 3045 | * vmx_vcpu_pi_put even if there is no assigned device. And we | ||
| 3046 | * always keep PI.NDST up to date for simplicity: it makes the | ||
| 3047 | * code easier, and CPU migration is not a fast path. | ||
| 3048 | */ | ||
| 3049 | if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) | ||
| 3050 | return; | ||
| 3051 | |||
| 3052 | /* | ||
| 3053 | * First handle the simple case where no cmpxchg is necessary; just | ||
| 3054 | * allow posting non-urgent interrupts. | ||
| 3055 | * | ||
| 3056 | * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change | ||
| 3057 | * PI.NDST: pi_post_block will do it for us and the wakeup_handler | ||
| 3058 | * expects the VCPU to be on the blocked_vcpu_list that matches | ||
| 3059 | * PI.NDST. | ||
| 3060 | */ | ||
| 3061 | if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || | ||
| 3062 | vcpu->cpu == cpu) { | ||
| 3063 | pi_clear_sn(pi_desc); | ||
| 3064 | return; | ||
| 3065 | } | ||
| 3066 | |||
| 3067 | /* The full case. */ | ||
| 3068 | do { | ||
| 3069 | old.control = new.control = pi_desc->control; | ||
| 3070 | |||
| 3071 | dest = cpu_physical_id(cpu); | ||
| 3072 | |||
| 3073 | if (x2apic_enabled()) | ||
| 3074 | new.ndst = dest; | ||
| 3075 | else | ||
| 3076 | new.ndst = (dest << 8) & 0xFF00; | ||
| 3077 | |||
| 3078 | new.sn = 0; | ||
| 3079 | } while (cmpxchg64(&pi_desc->control, old.control, | ||
| 3080 | new.control) != old.control); | ||
| 3081 | } | ||
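The only subtle part of the update above is the NDST encoding: in x2APIC mode the destination field takes the APIC ID directly, while in xAPIC mode the 8-bit APIC ID has to sit in bits 15:8 of NDST. A tiny stand-alone illustration of that encoding (the helper name is illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Encode a physical APIC ID into the posted-interrupt NDST field. */
static uint32_t pi_encode_ndst(uint32_t apic_id, bool x2apic)
{
	return x2apic ? apic_id : (apic_id << 8) & 0xFF00;
}

int main(void)
{
	printf("%#x\n", pi_encode_ndst(5, false));	/* xAPIC:  0x500 */
	printf("%#x\n", pi_encode_ndst(5, true));	/* x2APIC: 0x5 */
	return 0;
}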
| 3082 | |||
| 3083 | static void decache_tsc_multiplier(struct vcpu_vmx *vmx) | ||
| 3084 | { | ||
| 3085 | vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio; | ||
| 3086 | vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); | ||
| 3087 | } | ||
| 3088 | |||
| 3089 | /* | ||
| 3090 | * Switches to the specified vcpu, until a matching vcpu_put(); assumes | ||
| 3091 | * that the vcpu mutex is already taken. | ||
| 3092 | */ | ||
| 3093 | static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | ||
| 3094 | { | ||
| 3095 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 3096 | bool already_loaded = vmx->loaded_vmcs->cpu == cpu; | ||
| 3097 | |||
| 3098 | if (!already_loaded) { | ||
| 3099 | loaded_vmcs_clear(vmx->loaded_vmcs); | ||
| 3100 | local_irq_disable(); | ||
| 3101 | crash_disable_local_vmclear(cpu); | ||
| 3102 | |||
| 3103 | /* | ||
| 3104 | * Read loaded_vmcs->cpu should be before fetching | ||
| 3105 | * loaded_vmcs->loaded_vmcss_on_cpu_link. | ||
| 3106 | * See the comments in __loaded_vmcs_clear(). | ||
| 3107 | */ | ||
| 3108 | smp_rmb(); | ||
| 3109 | |||
| 3110 | list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, | ||
| 3111 | &per_cpu(loaded_vmcss_on_cpu, cpu)); | ||
| 3112 | crash_enable_local_vmclear(cpu); | ||
| 3113 | local_irq_enable(); | ||
| 3114 | } | ||
| 3115 | |||
| 3116 | if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { | ||
| 3117 | per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; | ||
| 3118 | vmcs_load(vmx->loaded_vmcs->vmcs); | ||
| 3119 | indirect_branch_prediction_barrier(); | ||
| 3120 | } | ||
| 3121 | |||
| 3122 | if (!already_loaded) { | ||
| 3123 | void *gdt = get_current_gdt_ro(); | ||
| 3124 | unsigned long sysenter_esp; | ||
| 3125 | |||
| 3126 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); | ||
| 3127 | |||
| 3128 | /* | ||
| 3129 | * Linux uses per-cpu TSS and GDT, so set these when switching | ||
| 3130 | * processors. See 22.2.4. | ||
| 3131 | */ | ||
| 3132 | vmcs_writel(HOST_TR_BASE, | ||
| 3133 | (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); | ||
| 3134 | vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ | ||
| 3135 | |||
| 3136 | /* | ||
| 3137 | * A VM exit changes the host TR limit to 0x67. This is okay, | ||
| 3138 | * since 0x67 covers everything except the IO bitmap, and we | ||
| 3139 | * have code to handle the IO bitmap being lost after a VM | ||
| 3140 | * exit. | ||
| 3141 | */ | ||
| 3142 | BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67); | ||
| 3143 | |||
| 3144 | rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); | ||
| 3145 | vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ | ||
| 3146 | |||
| 3147 | vmx->loaded_vmcs->cpu = cpu; | ||
| 3148 | } | ||
| 3149 | |||
| 3150 | /* Setup TSC multiplier */ | ||
| 3151 | if (kvm_has_tsc_control && | ||
| 3152 | vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) | ||
| 3153 | decache_tsc_multiplier(vmx); | ||
| 3154 | |||
| 3155 | vmx_vcpu_pi_load(vcpu, cpu); | ||
| 3156 | vmx->host_pkru = read_pkru(); | ||
| 3157 | vmx->host_debugctlmsr = get_debugctlmsr(); | ||
| 3158 | } | ||
| 3159 | |||
| 3160 | static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) | ||
| 3161 | { | ||
| 3162 | struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); | ||
| 3163 | |||
| 3164 | if (!kvm_arch_has_assigned_device(vcpu->kvm) || | ||
| 3165 | !irq_remapping_cap(IRQ_POSTING_CAP) || | ||
| 3166 | !kvm_vcpu_apicv_active(vcpu)) | ||
| 3167 | return; | ||
| 3168 | |||
| 3169 | /* Set SN when the vCPU is preempted */ | ||
| 3170 | if (vcpu->preempted) | ||
| 3171 | pi_set_sn(pi_desc); | ||
| 3172 | } | ||
| 3173 | |||
| 3174 | static void vmx_vcpu_put(struct kvm_vcpu *vcpu) | ||
| 3175 | { | ||
| 3176 | vmx_vcpu_pi_put(vcpu); | ||
| 3177 | |||
| 3178 | vmx_prepare_switch_to_host(to_vmx(vcpu)); | ||
| 3179 | } | ||
| 3180 | |||
| 3181 | static bool emulation_required(struct kvm_vcpu *vcpu) | ||
| 3182 | { | ||
| 3183 | return emulate_invalid_guest_state && !guest_state_valid(vcpu); | ||
| 3184 | } | ||
| 3185 | |||
| 3186 | static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); | ||
| 3187 | |||
| 3188 | /* | ||
| 3189 | * Return the cr0 value that a nested guest would read. This is a combination | ||
| 3190 | * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by | ||
| 3191 | * its hypervisor (cr0_read_shadow). | ||
| 3192 | */ | ||
| 3193 | static inline unsigned long nested_read_cr0(struct vmcs12 *fields) | ||
| 3194 | { | ||
| 3195 | return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) | | ||
| 3196 | (fields->cr0_read_shadow & fields->cr0_guest_host_mask); | ||
| 3197 | } | ||
| 3198 | static inline unsigned long nested_read_cr4(struct vmcs12 *fields) | ||
| 3199 | { | ||
| 3200 | return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) | | ||
| 3201 | (fields->cr4_read_shadow & fields->cr4_guest_host_mask); | ||
| 3202 | } | ||
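In both helpers the guest/host mask decides, bit by bit, where a nested read comes from: a clear bit means the value is taken from the register actually running L2, a set bit means L1 owns it and L2 sees the read shadow instead. A short worked example with made-up values:

#include <stdio.h>

static unsigned long nested_read_cr(unsigned long guest_val,
				    unsigned long guest_host_mask,
				    unsigned long read_shadow)
{
	return (guest_val & ~guest_host_mask) | (read_shadow & guest_host_mask);
}

int main(void)
{
	unsigned long guest_cr0 = 0x8005003b;	/* hardware CR0 running L2, TS (bit 3) set */
	unsigned long mask      = 0x00000008;	/* L1 owns CR0.TS */
	unsigned long shadow    = 0x00000000;	/* ...and told L2 that TS is clear */

	/* L2 reads 0x80050033: TS comes from the shadow, the rest from guest_cr0. */
	printf("%#lx\n", nested_read_cr(guest_cr0, mask, shadow));
	return 0;
}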
| 3203 | |||
| 3204 | static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) | ||
| 3205 | { | ||
| 3206 | unsigned long rflags, save_rflags; | ||
| 3207 | |||
| 3208 | if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) { | ||
| 3209 | __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); | ||
| 3210 | rflags = vmcs_readl(GUEST_RFLAGS); | ||
| 3211 | if (to_vmx(vcpu)->rmode.vm86_active) { | ||
| 3212 | rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; | ||
| 3213 | save_rflags = to_vmx(vcpu)->rmode.save_rflags; | ||
| 3214 | rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; | ||
| 3215 | } | ||
| 3216 | to_vmx(vcpu)->rflags = rflags; | ||
| 3217 | } | ||
| 3218 | return to_vmx(vcpu)->rflags; | ||
| 3219 | } | ||
| 3220 | |||
| 3221 | static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) | ||
| 3222 | { | ||
| 3223 | unsigned long old_rflags = vmx_get_rflags(vcpu); | ||
| 3224 | |||
| 3225 | __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); | ||
| 3226 | to_vmx(vcpu)->rflags = rflags; | ||
| 3227 | if (to_vmx(vcpu)->rmode.vm86_active) { | ||
| 3228 | to_vmx(vcpu)->rmode.save_rflags = rflags; | ||
| 3229 | rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; | ||
| 3230 | } | ||
| 3231 | vmcs_writel(GUEST_RFLAGS, rflags); | ||
| 3232 | |||
| 3233 | if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM) | ||
| 3234 | to_vmx(vcpu)->emulation_required = emulation_required(vcpu); | ||
| 3235 | } | ||
| 3236 | |||
| 3237 | static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) | ||
| 3238 | { | ||
| 3239 | u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); | ||
| 3240 | int ret = 0; | ||
| 3241 | |||
| 3242 | if (interruptibility & GUEST_INTR_STATE_STI) | ||
| 3243 | ret |= KVM_X86_SHADOW_INT_STI; | ||
| 3244 | if (interruptibility & GUEST_INTR_STATE_MOV_SS) | ||
| 3245 | ret |= KVM_X86_SHADOW_INT_MOV_SS; | ||
| 3246 | |||
| 3247 | return ret; | ||
| 3248 | } | ||
| 3249 | |||
| 3250 | static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) | ||
| 3251 | { | ||
| 3252 | u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); | ||
| 3253 | u32 interruptibility = interruptibility_old; | ||
| 3254 | |||
| 3255 | interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); | ||
| 3256 | |||
| 3257 | if (mask & KVM_X86_SHADOW_INT_MOV_SS) | ||
| 3258 | interruptibility |= GUEST_INTR_STATE_MOV_SS; | ||
| 3259 | else if (mask & KVM_X86_SHADOW_INT_STI) | ||
| 3260 | interruptibility |= GUEST_INTR_STATE_STI; | ||
| 3261 | |||
| 3262 | if ((interruptibility != interruptibility_old)) | ||
| 3263 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); | ||
| 3264 | } | ||
| 3265 | |||
| 3266 | static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | ||
| 3267 | { | ||
| 3268 | unsigned long rip; | ||
| 3269 | |||
| 3270 | rip = kvm_rip_read(vcpu); | ||
| 3271 | rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | ||
| 3272 | kvm_rip_write(vcpu, rip); | ||
| 3273 | |||
| 3274 | /* skipping an emulated instruction also counts */ | ||
| 3275 | vmx_set_interrupt_shadow(vcpu, 0); | ||
| 3276 | } | ||
| 3277 | |||
| 3278 | static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, | ||
| 3279 | unsigned long exit_qual) | ||
| 3280 | { | ||
| 3281 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 3282 | unsigned int nr = vcpu->arch.exception.nr; | ||
| 3283 | u32 intr_info = nr | INTR_INFO_VALID_MASK; | ||
| 3284 | |||
| 3285 | if (vcpu->arch.exception.has_error_code) { | ||
| 3286 | vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; | ||
| 3287 | intr_info |= INTR_INFO_DELIVER_CODE_MASK; | ||
| 3288 | } | ||
| 3289 | |||
| 3290 | if (kvm_exception_is_soft(nr)) | ||
| 3291 | intr_info |= INTR_TYPE_SOFT_EXCEPTION; | ||
| 3292 | else | ||
| 3293 | intr_info |= INTR_TYPE_HARD_EXCEPTION; | ||
| 3294 | |||
| 3295 | if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && | ||
| 3296 | vmx_get_nmi_mask(vcpu)) | ||
| 3297 | intr_info |= INTR_INFO_UNBLOCK_NMI; | ||
| 3298 | |||
| 3299 | nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); | ||
| 3300 | } | ||
| 3301 | |||
| 3302 | /* | ||
| 3303 | * KVM wants to re-inject into the guest page faults that it received. This | ||
| 3304 | * function checks whether, in a nested guest, they need to go to L1 or L2. | ||
| 3305 | */ | ||
| 3306 | static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual) | ||
| 3307 | { | ||
| 3308 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 3309 | unsigned int nr = vcpu->arch.exception.nr; | ||
| 3310 | bool has_payload = vcpu->arch.exception.has_payload; | ||
| 3311 | unsigned long payload = vcpu->arch.exception.payload; | ||
| 3312 | |||
| 3313 | if (nr == PF_VECTOR) { | ||
| 3314 | if (vcpu->arch.exception.nested_apf) { | ||
| 3315 | *exit_qual = vcpu->arch.apf.nested_apf_token; | ||
| 3316 | return 1; | ||
| 3317 | } | ||
| 3318 | if (nested_vmx_is_page_fault_vmexit(vmcs12, | ||
| 3319 | vcpu->arch.exception.error_code)) { | ||
| 3320 | *exit_qual = has_payload ? payload : vcpu->arch.cr2; | ||
| 3321 | return 1; | ||
| 3322 | } | ||
| 3323 | } else if (vmcs12->exception_bitmap & (1u << nr)) { | ||
| 3324 | if (nr == DB_VECTOR) { | ||
| 3325 | if (!has_payload) { | ||
| 3326 | payload = vcpu->arch.dr6; | ||
| 3327 | payload &= ~(DR6_FIXED_1 | DR6_BT); | ||
| 3328 | payload ^= DR6_RTM; | ||
| 3329 | } | ||
| 3330 | *exit_qual = payload; | ||
| 3331 | } else | ||
| 3332 | *exit_qual = 0; | ||
| 3333 | return 1; | ||
| 3334 | } | ||
| 3335 | |||
| 3336 | return 0; | ||
| 3337 | } | ||
| 3338 | |||
| 3339 | static void vmx_clear_hlt(struct kvm_vcpu *vcpu) | ||
| 3340 | { | ||
| 3341 | /* | ||
| 3342 | * Ensure that we clear the HLT state in the VMCS. We don't need to | ||
| 3343 | * explicitly skip the instruction because if the HLT state is set, | ||
| 3344 | * then the instruction is already executing and RIP has already been | ||
| 3345 | * advanced. | ||
| 3346 | */ | ||
| 3347 | if (kvm_hlt_in_guest(vcpu->kvm) && | ||
| 3348 | vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) | ||
| 3349 | vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); | ||
| 3350 | } | ||
| 3351 | |||
| 3352 | static void vmx_queue_exception(struct kvm_vcpu *vcpu) | ||
| 3353 | { | ||
| 3354 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 3355 | unsigned nr = vcpu->arch.exception.nr; | ||
| 3356 | bool has_error_code = vcpu->arch.exception.has_error_code; | ||
| 3357 | u32 error_code = vcpu->arch.exception.error_code; | ||
| 3358 | u32 intr_info = nr | INTR_INFO_VALID_MASK; | ||
| 3359 | |||
| 3360 | kvm_deliver_exception_payload(vcpu); | ||
| 3361 | |||
| 3362 | if (has_error_code) { | ||
| 3363 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); | ||
| 3364 | intr_info |= INTR_INFO_DELIVER_CODE_MASK; | ||
| 3365 | } | ||
| 3366 | |||
| 3367 | if (vmx->rmode.vm86_active) { | ||
| 3368 | int inc_eip = 0; | ||
| 3369 | if (kvm_exception_is_soft(nr)) | ||
| 3370 | inc_eip = vcpu->arch.event_exit_inst_len; | ||
| 3371 | if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE) | ||
| 3372 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); | ||
| 3373 | return; | ||
| 3374 | } | ||
| 3375 | |||
| 3376 | WARN_ON_ONCE(vmx->emulation_required); | ||
| 3377 | |||
| 3378 | if (kvm_exception_is_soft(nr)) { | ||
| 3379 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, | ||
| 3380 | vmx->vcpu.arch.event_exit_inst_len); | ||
| 3381 | intr_info |= INTR_TYPE_SOFT_EXCEPTION; | ||
| 3382 | } else | ||
| 3383 | intr_info |= INTR_TYPE_HARD_EXCEPTION; | ||
| 3384 | |||
| 3385 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); | ||
| 3386 | |||
| 3387 | vmx_clear_hlt(vcpu); | ||
| 3388 | } | ||
| 3389 | |||
| 3390 | static bool vmx_rdtscp_supported(void) | ||
| 3391 | { | ||
| 3392 | return cpu_has_vmx_rdtscp(); | ||
| 3393 | } | ||
| 3394 | |||
| 3395 | static bool vmx_invpcid_supported(void) | ||
| 3396 | { | ||
| 3397 | return cpu_has_vmx_invpcid(); | ||
| 3398 | } | ||
| 3399 | |||
| 3400 | /* | ||
| 3401 | * Swap two entries in the guest MSR (shared MSR) entry array. | ||
| 3402 | */ | ||
| 3403 | static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) | ||
| 3404 | { | ||
| 3405 | struct shared_msr_entry tmp; | ||
| 3406 | |||
| 3407 | tmp = vmx->guest_msrs[to]; | ||
| 3408 | vmx->guest_msrs[to] = vmx->guest_msrs[from]; | ||
| 3409 | vmx->guest_msrs[from] = tmp; | ||
| 3410 | } | ||
| 3411 | |||
| 3412 | /* | ||
| 3413 | * Set up the vmcs to automatically save and restore system | ||
| 3414 | * msrs. Don't touch the 64-bit msrs if the guest is in legacy | ||
| 3415 | * mode, as fiddling with msrs is very expensive. | ||
| 3416 | */ | ||
| 3417 | static void setup_msrs(struct vcpu_vmx *vmx) | ||
| 3418 | { | ||
| 3419 | int save_nmsrs, index; | ||
| 3420 | |||
| 3421 | save_nmsrs = 0; | ||
| 3422 | #ifdef CONFIG_X86_64 | ||
| 3423 | if (is_long_mode(&vmx->vcpu)) { | ||
| 3424 | index = __find_msr_index(vmx, MSR_SYSCALL_MASK); | ||
| 3425 | if (index >= 0) | ||
| 3426 | move_msr_up(vmx, index, save_nmsrs++); | ||
| 3427 | index = __find_msr_index(vmx, MSR_LSTAR); | ||
| 3428 | if (index >= 0) | ||
| 3429 | move_msr_up(vmx, index, save_nmsrs++); | ||
| 3430 | index = __find_msr_index(vmx, MSR_CSTAR); | ||
| 3431 | if (index >= 0) | ||
| 3432 | move_msr_up(vmx, index, save_nmsrs++); | ||
| 3433 | index = __find_msr_index(vmx, MSR_TSC_AUX); | ||
| 3434 | if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) | ||
| 3435 | move_msr_up(vmx, index, save_nmsrs++); | ||
| 3436 | /* | ||
| 3437 | * MSR_STAR is only needed on long mode guests, and only | ||
| 3438 | * if efer.sce is enabled. | ||
| 3439 | */ | ||
| 3440 | index = __find_msr_index(vmx, MSR_STAR); | ||
| 3441 | if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) | ||
| 3442 | move_msr_up(vmx, index, save_nmsrs++); | ||
| 3443 | } | ||
| 3444 | #endif | ||
| 3445 | index = __find_msr_index(vmx, MSR_EFER); | ||
| 3446 | if (index >= 0 && update_transition_efer(vmx, index)) | ||
| 3447 | move_msr_up(vmx, index, save_nmsrs++); | ||
| 3448 | |||
| 3449 | vmx->save_nmsrs = save_nmsrs; | ||
| 3450 | vmx->guest_msrs_dirty = true; | ||
| 3451 | |||
| 3452 | if (cpu_has_vmx_msr_bitmap()) | ||
| 3453 | vmx_update_msr_bitmap(&vmx->vcpu); | ||
| 3454 | } | ||
| 3455 | |||
| 3456 | static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu) | ||
| 3457 | { | ||
| 3458 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 3459 | |||
| 3460 | if (is_guest_mode(vcpu) && | ||
| 3461 | (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)) | ||
| 3462 | return vcpu->arch.tsc_offset - vmcs12->tsc_offset; | ||
| 3463 | |||
| 3464 | return vcpu->arch.tsc_offset; | ||
| 3465 | } | ||
| 3466 | |||
| 3467 | static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) | ||
| 3468 | { | ||
| 3469 | u64 active_offset = offset; | ||
| 3470 | if (is_guest_mode(vcpu)) { | ||
| 3471 | /* | ||
| 3472 | * We're here if L1 chose not to trap WRMSR to TSC. According | ||
| 3473 | * to the spec, this should set L1's TSC; the offset that L1 | ||
| 3474 | * set for L2 remains unchanged, and still needs to be added | ||
| 3475 | * to the newly set TSC to get L2's TSC. | ||
| 3476 | */ | ||
| 3477 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 3478 | if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING)) | ||
| 3479 | active_offset += vmcs12->tsc_offset; | ||
| 3480 | } else { | ||
| 3481 | trace_kvm_write_tsc_offset(vcpu->vcpu_id, | ||
| 3482 | vmcs_read64(TSC_OFFSET), offset); | ||
| 3483 | } | ||
| 3484 | |||
| 3485 | vmcs_write64(TSC_OFFSET, active_offset); | ||
| 3486 | return active_offset; | ||
| 3487 | } | ||
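The arithmetic behind the comment above is purely additive: the TSC a guest reads is the host TSC plus the active offset, and while L2 runs the active offset is L1's offset plus whatever L1 programmed into vmcs12->tsc_offset. A toy illustration with made-up numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t host_tsc  = 1000000;
	uint64_t l1_offset = 500;	/* offset KVM programs for L1 */
	uint64_t l2_offset = 70;	/* vmcs12->tsc_offset, chosen by L1 for L2 */

	/* While L2 runs, the TSC_OFFSET in vmcs02 is the sum of both offsets. */
	printf("L1 reads %llu\n", (unsigned long long)(host_tsc + l1_offset));
	printf("L2 reads %llu\n", (unsigned long long)(host_tsc + l1_offset + l2_offset));
	return 0;
}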
| 3488 | |||
| 3489 | /* | ||
| 3490 | * nested_vmx_allowed() checks whether a guest should be allowed to use VMX | ||
| 3491 | * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for | ||
| 3492 | * all guests if the "nested" module option is off, and can also be disabled | ||
| 3493 | * for a single guest by disabling its VMX cpuid bit. | ||
| 3494 | */ | ||
| 3495 | static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) | ||
| 3496 | { | ||
| 3497 | return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX); | ||
| 3498 | } | ||
| 3499 | |||
| 3500 | /* | ||
| 3501 | * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be | ||
| 3502 | * returned for the various VMX controls MSRs when nested VMX is enabled. | ||
| 3503 | * The same values should also be used to verify that vmcs12 control fields are | ||
| 3504 | * valid during nested entry from L1 to L2. | ||
| 3505 | * Each of these control msrs has a low and high 32-bit half: A low bit is on | ||
| 3506 | * if the corresponding bit in the (32-bit) control field *must* be on, and a | ||
| 3507 | * bit in the high half is on if the corresponding bit in the control field | ||
| 3508 | * may be on. See also vmx_control_verify(). | ||
| 3509 | */ | ||
| 3510 | static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) | ||
| 3511 | { | ||
| 3512 | if (!nested) { | ||
| 3513 | memset(msrs, 0, sizeof(*msrs)); | ||
| 3514 | return; | ||
| 3515 | } | ||
| 3516 | |||
| 3517 | /* | ||
| 3518 | * Note that as a general rule, the high half of the MSRs (bits in | ||
| 3519 | * the control fields which may be 1) should be initialized by the | ||
| 3520 | * intersection of the underlying hardware's MSR (i.e., features which | ||
| 3521 | * can be supported) and the list of features we want to expose - | ||
| 3522 | * because they are known to be properly supported in our code. | ||
| 3523 | * Also, usually, the low half of the MSRs (bits which must be 1) can | ||
| 3524 | * be set to 0, meaning that L1 may turn off any of these bits. The | ||
| 3525 | * reason is that if one of these bits is necessary, it will appear | ||
| 3526 | * in vmcs01, and prepare_vmcs02, when it bitwise-or's the control | ||
| 3527 | * fields of vmcs01 and vmcs12, will turn these bits on - and | ||
| 3528 | * nested_vmx_exit_reflected() will not pass related exits to L1. | ||
| 3529 | * These rules have exceptions below. | ||
| 3530 | */ | ||
| 3531 | |||
| 3532 | /* pin-based controls */ | ||
| 3533 | rdmsr(MSR_IA32_VMX_PINBASED_CTLS, | ||
| 3534 | msrs->pinbased_ctls_low, | ||
| 3535 | msrs->pinbased_ctls_high); | ||
| 3536 | msrs->pinbased_ctls_low |= | ||
| 3537 | PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; | ||
| 3538 | msrs->pinbased_ctls_high &= | ||
| 3539 | PIN_BASED_EXT_INTR_MASK | | ||
| 3540 | PIN_BASED_NMI_EXITING | | ||
| 3541 | PIN_BASED_VIRTUAL_NMIS | | ||
| 3542 | (apicv ? PIN_BASED_POSTED_INTR : 0); | ||
| 3543 | msrs->pinbased_ctls_high |= | ||
| 3544 | PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | | ||
| 3545 | PIN_BASED_VMX_PREEMPTION_TIMER; | ||
| 3546 | |||
| 3547 | /* exit controls */ | ||
| 3548 | rdmsr(MSR_IA32_VMX_EXIT_CTLS, | ||
| 3549 | msrs->exit_ctls_low, | ||
| 3550 | msrs->exit_ctls_high); | ||
| 3551 | msrs->exit_ctls_low = | ||
| 3552 | VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; | ||
| 3553 | |||
| 3554 | msrs->exit_ctls_high &= | ||
| 3555 | #ifdef CONFIG_X86_64 | ||
| 3556 | VM_EXIT_HOST_ADDR_SPACE_SIZE | | ||
| 3557 | #endif | ||
| 3558 | VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; | ||
| 3559 | msrs->exit_ctls_high |= | ||
| 3560 | VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | | ||
| 3561 | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | | ||
| 3562 | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; | ||
| 3563 | |||
| 3564 | /* We support free control of debug control saving. */ | ||
| 3565 | msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; | ||
| 3566 | |||
| 3567 | /* entry controls */ | ||
| 3568 | rdmsr(MSR_IA32_VMX_ENTRY_CTLS, | ||
| 3569 | msrs->entry_ctls_low, | ||
| 3570 | msrs->entry_ctls_high); | ||
| 3571 | msrs->entry_ctls_low = | ||
| 3572 | VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; | ||
| 3573 | msrs->entry_ctls_high &= | ||
| 3574 | #ifdef CONFIG_X86_64 | ||
| 3575 | VM_ENTRY_IA32E_MODE | | ||
| 3576 | #endif | ||
| 3577 | VM_ENTRY_LOAD_IA32_PAT; | ||
| 3578 | msrs->entry_ctls_high |= | ||
| 3579 | (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); | ||
| 3580 | |||
| 3581 | /* We support free control of debug control loading. */ | ||
| 3582 | msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; | ||
| 3583 | |||
| 3584 | /* cpu-based controls */ | ||
| 3585 | rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, | ||
| 3586 | msrs->procbased_ctls_low, | ||
| 3587 | msrs->procbased_ctls_high); | ||
| 3588 | msrs->procbased_ctls_low = | ||
| 3589 | CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; | ||
| 3590 | msrs->procbased_ctls_high &= | ||
| 3591 | CPU_BASED_VIRTUAL_INTR_PENDING | | ||
| 3592 | CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | | ||
| 3593 | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | | ||
| 3594 | CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | | ||
| 3595 | CPU_BASED_CR3_STORE_EXITING | | ||
| 3596 | #ifdef CONFIG_X86_64 | ||
| 3597 | CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | | ||
| 3598 | #endif | ||
| 3599 | CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | | ||
| 3600 | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | | ||
| 3601 | CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | | ||
| 3602 | CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | | ||
| 3603 | CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; | ||
| 3604 | /* | ||
| 3605 | * We can allow some features even when not supported by the | ||
| 3606 | * hardware. For example, L1 can specify an MSR bitmap - and we | ||
| 3607 | * can use it to avoid exits to L1 - even when L0 runs L2 | ||
| 3608 | * without MSR bitmaps. | ||
| 3609 | */ | ||
| 3610 | msrs->procbased_ctls_high |= | ||
| 3611 | CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | | ||
| 3612 | CPU_BASED_USE_MSR_BITMAPS; | ||
| 3613 | |||
| 3614 | /* We support free control of CR3 access interception. */ | ||
| 3615 | msrs->procbased_ctls_low &= | ||
| 3616 | ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); | ||
| 3617 | |||
| 3618 | /* | ||
| 3619 | * secondary cpu-based controls. Do not include those that | ||
| 3620 | * depend on CPUID bits, they are added later by vmx_cpuid_update. | ||
| 3621 | */ | ||
| 3622 | rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, | ||
| 3623 | msrs->secondary_ctls_low, | ||
| 3624 | msrs->secondary_ctls_high); | ||
| 3625 | msrs->secondary_ctls_low = 0; | ||
| 3626 | msrs->secondary_ctls_high &= | ||
| 3627 | SECONDARY_EXEC_DESC | | ||
| 3628 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | | ||
| 3629 | SECONDARY_EXEC_APIC_REGISTER_VIRT | | ||
| 3630 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | | ||
| 3631 | SECONDARY_EXEC_WBINVD_EXITING; | ||
| 3632 | |||
| 3633 | /* | ||
| 3634 | * We can emulate "VMCS shadowing," even if the hardware | ||
| 3635 | * doesn't support it. | ||
| 3636 | */ | ||
| 3637 | msrs->secondary_ctls_high |= | ||
| 3638 | SECONDARY_EXEC_SHADOW_VMCS; | ||
| 3639 | |||
| 3640 | if (enable_ept) { | ||
| 3641 | /* nested EPT: emulate EPT also to L1 */ | ||
| 3642 | msrs->secondary_ctls_high |= | ||
| 3643 | SECONDARY_EXEC_ENABLE_EPT; | ||
| 3644 | msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT | | ||
| 3645 | VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; | ||
| 3646 | if (cpu_has_vmx_ept_execute_only()) | ||
| 3647 | msrs->ept_caps |= | ||
| 3648 | VMX_EPT_EXECUTE_ONLY_BIT; | ||
| 3649 | msrs->ept_caps &= vmx_capability.ept; | ||
| 3650 | msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | | ||
| 3651 | VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | | ||
| 3652 | VMX_EPT_1GB_PAGE_BIT; | ||
| 3653 | if (enable_ept_ad_bits) { | ||
| 3654 | msrs->secondary_ctls_high |= | ||
| 3655 | SECONDARY_EXEC_ENABLE_PML; | ||
| 3656 | msrs->ept_caps |= VMX_EPT_AD_BIT; | ||
| 3657 | } | ||
| 3658 | } | ||
| 3659 | |||
| 3660 | if (cpu_has_vmx_vmfunc()) { | ||
| 3661 | msrs->secondary_ctls_high |= | ||
| 3662 | SECONDARY_EXEC_ENABLE_VMFUNC; | ||
| 3663 | /* | ||
| 3664 | * Advertise EPTP switching unconditionally | ||
| 3665 | * since we emulate it | ||
| 3666 | */ | ||
| 3667 | if (enable_ept) | ||
| 3668 | msrs->vmfunc_controls = | ||
| 3669 | VMX_VMFUNC_EPTP_SWITCHING; | ||
| 3670 | } | ||
| 3671 | |||
| 3672 | /* | ||
| 3673 | * Old versions of KVM use the single-context version without | ||
| 3674 | * checking for support, so declare that it is supported even | ||
| 3675 | * though it is treated as global context. The alternative is | ||
| 3676 | * not failing the single-context invvpid, and it is worse. | ||
| 3677 | */ | ||
| 3678 | if (enable_vpid) { | ||
| 3679 | msrs->secondary_ctls_high |= | ||
| 3680 | SECONDARY_EXEC_ENABLE_VPID; | ||
| 3681 | msrs->vpid_caps = VMX_VPID_INVVPID_BIT | | ||
| 3682 | VMX_VPID_EXTENT_SUPPORTED_MASK; | ||
| 3683 | } | ||
| 3684 | |||
| 3685 | if (enable_unrestricted_guest) | ||
| 3686 | msrs->secondary_ctls_high |= | ||
| 3687 | SECONDARY_EXEC_UNRESTRICTED_GUEST; | ||
| 3688 | |||
| 3689 | if (flexpriority_enabled) | ||
| 3690 | msrs->secondary_ctls_high |= | ||
| 3691 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
| 3692 | |||
| 3693 | /* miscellaneous data */ | ||
| 3694 | rdmsr(MSR_IA32_VMX_MISC, | ||
| 3695 | msrs->misc_low, | ||
| 3696 | msrs->misc_high); | ||
| 3697 | msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA; | ||
| 3698 | msrs->misc_low |= | ||
| 3699 | MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | | ||
| 3700 | VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | | ||
| 3701 | VMX_MISC_ACTIVITY_HLT; | ||
| 3702 | msrs->misc_high = 0; | ||
| 3703 | |||
| 3704 | /* | ||
| 3705 | * This MSR reports some information about VMX support. We | ||
| 3706 | * should return information about the VMX we emulate for the | ||
| 3707 | * guest, and the VMCS structure we give it - not about the | ||
| 3708 | * VMX support of the underlying hardware. | ||
| 3709 | */ | ||
| 3710 | msrs->basic = | ||
| 3711 | VMCS12_REVISION | | ||
| 3712 | VMX_BASIC_TRUE_CTLS | | ||
| 3713 | ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | | ||
| 3714 | (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); | ||
| 3715 | |||
| 3716 | if (cpu_has_vmx_basic_inout()) | ||
| 3717 | msrs->basic |= VMX_BASIC_INOUT; | ||
| 3718 | |||
| 3719 | /* | ||
| 3720 | * These MSRs specify bits which the guest must keep fixed on | ||
| 3721 | * while L1 is in VMXON mode (in L1's root mode, or running an L2). | ||
| 3722 | * We picked the standard core2 setting. | ||
| 3723 | */ | ||
| 3724 | #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) | ||
| 3725 | #define VMXON_CR4_ALWAYSON X86_CR4_VMXE | ||
| 3726 | msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; | ||
| 3727 | msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; | ||
| 3728 | |||
| 3729 | /* These MSRs specify bits which the guest must keep fixed off. */ | ||
| 3730 | rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); | ||
| 3731 | rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); | ||
| 3732 | |||
| 3733 | /* highest index: VMX_PREEMPTION_TIMER_VALUE */ | ||
| 3734 | msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1; | ||
| 3735 | } | ||
| 3736 | |||
| 3737 | /* | ||
| 3738 | * if fixed0[i] == 1: val[i] must be 1 | ||
| 3739 | * if fixed1[i] == 0: val[i] must be 0 | ||
| 3740 | */ | ||
| 3741 | static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1) | ||
| 3742 | { | ||
| 3743 | return ((val & fixed1) | fixed0) == val; | ||
| 3744 | } | ||
| 3745 | |||
| 3746 | static inline bool vmx_control_verify(u32 control, u32 low, u32 high) | ||
| 3747 | { | ||
| 3748 | return fixed_bits_valid(control, low, high); | ||
| 3749 | } | ||
| 3750 | |||
| 3751 | static inline u64 vmx_control_msr(u32 low, u32 high) | ||
| 3752 | { | ||
| 3753 | return low | ((u64)high << 32); | ||
| 3754 | } | ||
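As a quick illustration of the low/high convention described in the comments above, here is a standalone userspace sketch (not kernel code; the capability halves are invented for the example) that re-implements the same check as fixed_bits_valid()/vmx_control_verify():

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Same check as fixed_bits_valid()/vmx_control_verify() above. */
static bool control_ok(uint32_t control, uint32_t low, uint32_t high)
{
	return ((control & high) | low) == control;
}

int main(void)
{
	/* Invented capability: bits 1, 2 and 4 must be 1; only bits 1-6 may be 1. */
	const uint32_t low = 0x16, high = 0x7e;

	assert(control_ok(0x16, low, high));	/* exactly the required bits */
	assert(control_ok(0x36, low, high));	/* plus an optional bit (bit 5) */
	assert(!control_ok(0x17, low, high));	/* bit 0 is not a may-be-1 bit */
	assert(!control_ok(0x06, low, high));	/* required bit 4 is missing */
	return 0;
}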
| 3755 | |||
| 3756 | static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) | ||
| 3757 | { | ||
| 3758 | superset &= mask; | ||
| 3759 | subset &= mask; | ||
| 3760 | |||
| 3761 | return (superset | subset) == superset; | ||
| 3762 | } | ||
| 3763 | |||
| 3764 | static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) | ||
| 3765 | { | ||
| 3766 | const u64 feature_and_reserved = | ||
| 3767 | /* feature (except bit 48; see below) */ | ||
| 3768 | BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | | ||
| 3769 | /* reserved */ | ||
| 3770 | BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); | ||
| 3771 | u64 vmx_basic = vmx->nested.msrs.basic; | ||
| 3772 | |||
| 3773 | if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) | ||
| 3774 | return -EINVAL; | ||
| 3775 | |||
| 3776 | /* | ||
| 3777 | * KVM does not emulate a version of VMX that constrains physical | ||
| 3778 | * addresses of VMX structures (e.g. VMCS) to 32-bits. | ||
| 3779 | */ | ||
| 3780 | if (data & BIT_ULL(48)) | ||
| 3781 | return -EINVAL; | ||
| 3782 | |||
| 3783 | if (vmx_basic_vmcs_revision_id(vmx_basic) != | ||
| 3784 | vmx_basic_vmcs_revision_id(data)) | ||
| 3785 | return -EINVAL; | ||
| 3786 | |||
| 3787 | if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) | ||
| 3788 | return -EINVAL; | ||
| 3789 | |||
| 3790 | vmx->nested.msrs.basic = data; | ||
| 3791 | return 0; | ||
| 3792 | } | ||
| 3793 | |||
| 3794 | static int | ||
| 3795 | vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) | ||
| 3796 | { | ||
| 3797 | u64 supported; | ||
| 3798 | u32 *lowp, *highp; | ||
| 3799 | |||
| 3800 | switch (msr_index) { | ||
| 3801 | case MSR_IA32_VMX_TRUE_PINBASED_CTLS: | ||
| 3802 | lowp = &vmx->nested.msrs.pinbased_ctls_low; | ||
| 3803 | highp = &vmx->nested.msrs.pinbased_ctls_high; | ||
| 3804 | break; | ||
| 3805 | case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: | ||
| 3806 | lowp = &vmx->nested.msrs.procbased_ctls_low; | ||
| 3807 | highp = &vmx->nested.msrs.procbased_ctls_high; | ||
| 3808 | break; | ||
| 3809 | case MSR_IA32_VMX_TRUE_EXIT_CTLS: | ||
| 3810 | lowp = &vmx->nested.msrs.exit_ctls_low; | ||
| 3811 | highp = &vmx->nested.msrs.exit_ctls_high; | ||
| 3812 | break; | ||
| 3813 | case MSR_IA32_VMX_TRUE_ENTRY_CTLS: | ||
| 3814 | lowp = &vmx->nested.msrs.entry_ctls_low; | ||
| 3815 | highp = &vmx->nested.msrs.entry_ctls_high; | ||
| 3816 | break; | ||
| 3817 | case MSR_IA32_VMX_PROCBASED_CTLS2: | ||
| 3818 | lowp = &vmx->nested.msrs.secondary_ctls_low; | ||
| 3819 | highp = &vmx->nested.msrs.secondary_ctls_high; | ||
| 3820 | break; | ||
| 3821 | default: | ||
| 3822 | BUG(); | ||
| 3823 | } | ||
| 3824 | |||
| 3825 | supported = vmx_control_msr(*lowp, *highp); | ||
| 3826 | |||
| 3827 | /* Check must-be-1 bits are still 1. */ | ||
| 3828 | if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) | ||
| 3829 | return -EINVAL; | ||
| 3830 | |||
| 3831 | /* Check must-be-0 bits are still 0. */ | ||
| 3832 | if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) | ||
| 3833 | return -EINVAL; | ||
| 3834 | |||
| 3835 | *lowp = data; | ||
| 3836 | *highp = data >> 32; | ||
| 3837 | return 0; | ||
| 3838 | } | ||
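To make the two-direction check above concrete, here is a userspace sketch (all values invented, the helper is a copy of is_bitwise_subset() above): a restored "true" control MSR may hide optional (may-be-1) features, but it may neither drop a must-be-1 bit nor advertise a feature that is not supported:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Copy of is_bitwise_subset() above. */
static bool bitwise_subset(uint64_t superset, uint64_t subset, uint64_t mask)
{
	superset &= mask;
	subset &= mask;
	return (superset | subset) == superset;
}

int main(void)
{
	const uint64_t low32  = 0x00000000ffffffffULL;
	const uint64_t high32 = 0xffffffff00000000ULL;

	/* KVM's view: low half = must-be-1 bits, high half = may-be-1 bits. */
	uint64_t supported  = ((uint64_t)0x1e << 32) | 0x16;
	/* Userspace hides an optional feature (clears bit 3 of the high half). */
	uint64_t restricted = ((uint64_t)0x16 << 32) | 0x16;
	/* Userspace tries to advertise an unsupported feature (bit 7). */
	uint64_t bogus      = ((uint64_t)0x9e << 32) | 0x16;

	assert(bitwise_subset(restricted, supported, low32));	/* must-be-1 bits kept */
	assert(bitwise_subset(supported, restricted, high32));	/* no new features */
	assert(!bitwise_subset(supported, bogus, high32));	/* rejected */
	return 0;
}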
| 3839 | |||
| 3840 | static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) | ||
| 3841 | { | ||
| 3842 | const u64 feature_and_reserved_bits = | ||
| 3843 | /* feature */ | ||
| 3844 | BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | | ||
| 3845 | BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | | ||
| 3846 | /* reserved */ | ||
| 3847 | GENMASK_ULL(13, 9) | BIT_ULL(31); | ||
| 3848 | u64 vmx_misc; | ||
| 3849 | |||
| 3850 | vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, | ||
| 3851 | vmx->nested.msrs.misc_high); | ||
| 3852 | |||
| 3853 | if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) | ||
| 3854 | return -EINVAL; | ||
| 3855 | |||
| 3856 | if ((vmx->nested.msrs.pinbased_ctls_high & | ||
| 3857 | PIN_BASED_VMX_PREEMPTION_TIMER) && | ||
| 3858 | vmx_misc_preemption_timer_rate(data) != | ||
| 3859 | vmx_misc_preemption_timer_rate(vmx_misc)) | ||
| 3860 | return -EINVAL; | ||
| 3861 | |||
| 3862 | if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) | ||
| 3863 | return -EINVAL; | ||
| 3864 | |||
| 3865 | if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) | ||
| 3866 | return -EINVAL; | ||
| 3867 | |||
| 3868 | if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) | ||
| 3869 | return -EINVAL; | ||
| 3870 | |||
| 3871 | vmx->nested.msrs.misc_low = data; | ||
| 3872 | vmx->nested.msrs.misc_high = data >> 32; | ||
| 3873 | |||
| 3874 | /* | ||
| 3875 | * If L1 has read-only VM-exit information fields, use the | ||
| 3876 | * less permissive vmx_vmwrite_bitmap to specify write | ||
| 3877 | * permissions for the shadow VMCS. | ||
| 3878 | */ | ||
| 3879 | if (enable_shadow_vmcs && !nested_cpu_has_vmwrite_any_field(&vmx->vcpu)) | ||
| 3880 | vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); | ||
| 3881 | |||
| 3882 | return 0; | ||
| 3883 | } | ||
| 3884 | |||
| 3885 | static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) | ||
| 3886 | { | ||
| 3887 | u64 vmx_ept_vpid_cap; | ||
| 3888 | |||
| 3889 | vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps, | ||
| 3890 | vmx->nested.msrs.vpid_caps); | ||
| 3891 | |||
| 3892 | /* Every bit is either reserved or a feature bit. */ | ||
| 3893 | if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) | ||
| 3894 | return -EINVAL; | ||
| 3895 | |||
| 3896 | vmx->nested.msrs.ept_caps = data; | ||
| 3897 | vmx->nested.msrs.vpid_caps = data >> 32; | ||
| 3898 | return 0; | ||
| 3899 | } | ||
| 3900 | |||
| 3901 | static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) | ||
| 3902 | { | ||
| 3903 | u64 *msr; | ||
| 3904 | |||
| 3905 | switch (msr_index) { | ||
| 3906 | case MSR_IA32_VMX_CR0_FIXED0: | ||
| 3907 | msr = &vmx->nested.msrs.cr0_fixed0; | ||
| 3908 | break; | ||
| 3909 | case MSR_IA32_VMX_CR4_FIXED0: | ||
| 3910 | msr = &vmx->nested.msrs.cr4_fixed0; | ||
| 3911 | break; | ||
| 3912 | default: | ||
| 3913 | BUG(); | ||
| 3914 | } | ||
| 3915 | |||
| 3916 | /* | ||
| 3917 | * Bits that are 1 (i.e. bits which "must be 1" during VMX operation) | ||
| 3918 | * must also be 1 in the restored value. | ||
| 3919 | */ | ||
| 3920 | if (!is_bitwise_subset(data, *msr, -1ULL)) | ||
| 3921 | return -EINVAL; | ||
| 3922 | |||
| 3923 | *msr = data; | ||
| 3924 | return 0; | ||
| 3925 | } | ||
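A minimal sketch of the rule enforced above (the helper name is invented; the CR0 bit positions are the architectural PE, NE and PG bits used by VMXON_CR0_ALWAYSON): a restored CR0/CR4 FIXED0 value may add must-be-1 bits, but may never drop one:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Same subset rule as vmx_restore_fixed0_msr() above: current must be a subset of data. */
static bool fixed0_restore_ok(uint64_t current, uint64_t data)
{
	return (data | current) == data;
}

int main(void)
{
	const uint64_t PE = 1ULL << 0, NE = 1ULL << 5, PG = 1ULL << 31;
	uint64_t cur = PE | PG | NE;

	assert(fixed0_restore_ok(cur, cur | (1ULL << 3)));	/* stricter is fine */
	assert(!fixed0_restore_ok(cur, PE | PG));		/* dropping NE is not */
	return 0;
}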
| 3926 | |||
| 3927 | /* | ||
| 3928 | * Called when userspace is restoring VMX MSRs. | ||
| 3929 | * | ||
| 3930 | * Returns 0 on success, non-0 otherwise. | ||
| 3931 | */ | ||
| 3932 | static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | ||
| 3933 | { | ||
| 3934 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 3935 | |||
| 3936 | /* | ||
| 3937 | * Don't allow changes to the VMX capability MSRs while the vCPU | ||
| 3938 | * is in VMX operation. | ||
| 3939 | */ | ||
| 3940 | if (vmx->nested.vmxon) | ||
| 3941 | return -EBUSY; | ||
| 3942 | |||
| 3943 | switch (msr_index) { | ||
| 3944 | case MSR_IA32_VMX_BASIC: | ||
| 3945 | return vmx_restore_vmx_basic(vmx, data); | ||
| 3946 | case MSR_IA32_VMX_PINBASED_CTLS: | ||
| 3947 | case MSR_IA32_VMX_PROCBASED_CTLS: | ||
| 3948 | case MSR_IA32_VMX_EXIT_CTLS: | ||
| 3949 | case MSR_IA32_VMX_ENTRY_CTLS: | ||
| 3950 | /* | ||
| 3951 | * The "non-true" VMX capability MSRs are generated from the | ||
| 3952 | * "true" MSRs, so we do not support restoring them directly. | ||
| 3953 | * | ||
| 3954 | * If userspace wants to emulate VMX_BASIC[55]=0, userspace | ||
| 3955 | * should restore the "true" MSRs with the must-be-1 bits | ||
| 3956 | * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND | ||
| 3957 | * DEFAULT SETTINGS". | ||
| 3958 | */ | ||
| 3959 | return -EINVAL; | ||
| 3960 | case MSR_IA32_VMX_TRUE_PINBASED_CTLS: | ||
| 3961 | case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: | ||
| 3962 | case MSR_IA32_VMX_TRUE_EXIT_CTLS: | ||
| 3963 | case MSR_IA32_VMX_TRUE_ENTRY_CTLS: | ||
| 3964 | case MSR_IA32_VMX_PROCBASED_CTLS2: | ||
| 3965 | return vmx_restore_control_msr(vmx, msr_index, data); | ||
| 3966 | case MSR_IA32_VMX_MISC: | ||
| 3967 | return vmx_restore_vmx_misc(vmx, data); | ||
| 3968 | case MSR_IA32_VMX_CR0_FIXED0: | ||
| 3969 | case MSR_IA32_VMX_CR4_FIXED0: | ||
| 3970 | return vmx_restore_fixed0_msr(vmx, msr_index, data); | ||
| 3971 | case MSR_IA32_VMX_CR0_FIXED1: | ||
| 3972 | case MSR_IA32_VMX_CR4_FIXED1: | ||
| 3973 | /* | ||
| 3974 | * These MSRs are generated based on the vCPU's CPUID, so we | ||
| 3975 | * do not support restoring them directly. | ||
| 3976 | */ | ||
| 3977 | return -EINVAL; | ||
| 3978 | case MSR_IA32_VMX_EPT_VPID_CAP: | ||
| 3979 | return vmx_restore_vmx_ept_vpid_cap(vmx, data); | ||
| 3980 | case MSR_IA32_VMX_VMCS_ENUM: | ||
| 3981 | vmx->nested.msrs.vmcs_enum = data; | ||
| 3982 | return 0; | ||
| 3983 | default: | ||
| 3984 | /* | ||
| 3985 | * The rest of the VMX capability MSRs do not support restore. | ||
| 3986 | */ | ||
| 3987 | return -EINVAL; | ||
| 3988 | } | ||
| 3989 | } | ||
| 3990 | |||
| 3991 | /* Returns 0 on success, non-0 otherwise. */ | ||
| 3992 | static int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) | ||
| 3993 | { | ||
| 3994 | switch (msr_index) { | ||
| 3995 | case MSR_IA32_VMX_BASIC: | ||
| 3996 | *pdata = msrs->basic; | ||
| 3997 | break; | ||
| 3998 | case MSR_IA32_VMX_TRUE_PINBASED_CTLS: | ||
| 3999 | case MSR_IA32_VMX_PINBASED_CTLS: | ||
| 4000 | *pdata = vmx_control_msr( | ||
| 4001 | msrs->pinbased_ctls_low, | ||
| 4002 | msrs->pinbased_ctls_high); | ||
| 4003 | if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) | ||
| 4004 | *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; | ||
| 4005 | break; | ||
| 4006 | case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: | ||
| 4007 | case MSR_IA32_VMX_PROCBASED_CTLS: | ||
| 4008 | *pdata = vmx_control_msr( | ||
| 4009 | msrs->procbased_ctls_low, | ||
| 4010 | msrs->procbased_ctls_high); | ||
| 4011 | if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) | ||
| 4012 | *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; | ||
| 4013 | break; | ||
| 4014 | case MSR_IA32_VMX_TRUE_EXIT_CTLS: | ||
| 4015 | case MSR_IA32_VMX_EXIT_CTLS: | ||
| 4016 | *pdata = vmx_control_msr( | ||
| 4017 | msrs->exit_ctls_low, | ||
| 4018 | msrs->exit_ctls_high); | ||
| 4019 | if (msr_index == MSR_IA32_VMX_EXIT_CTLS) | ||
| 4020 | *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; | ||
| 4021 | break; | ||
| 4022 | case MSR_IA32_VMX_TRUE_ENTRY_CTLS: | ||
| 4023 | case MSR_IA32_VMX_ENTRY_CTLS: | ||
| 4024 | *pdata = vmx_control_msr( | ||
| 4025 | msrs->entry_ctls_low, | ||
| 4026 | msrs->entry_ctls_high); | ||
| 4027 | if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) | ||
| 4028 | *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; | ||
| 4029 | break; | ||
| 4030 | case MSR_IA32_VMX_MISC: | ||
| 4031 | *pdata = vmx_control_msr( | ||
| 4032 | msrs->misc_low, | ||
| 4033 | msrs->misc_high); | ||
| 4034 | break; | ||
| 4035 | case MSR_IA32_VMX_CR0_FIXED0: | ||
| 4036 | *pdata = msrs->cr0_fixed0; | ||
| 4037 | break; | ||
| 4038 | case MSR_IA32_VMX_CR0_FIXED1: | ||
| 4039 | *pdata = msrs->cr0_fixed1; | ||
| 4040 | break; | ||
| 4041 | case MSR_IA32_VMX_CR4_FIXED0: | ||
| 4042 | *pdata = msrs->cr4_fixed0; | ||
| 4043 | break; | ||
| 4044 | case MSR_IA32_VMX_CR4_FIXED1: | ||
| 4045 | *pdata = msrs->cr4_fixed1; | ||
| 4046 | break; | ||
| 4047 | case MSR_IA32_VMX_VMCS_ENUM: | ||
| 4048 | *pdata = msrs->vmcs_enum; | ||
| 4049 | break; | ||
| 4050 | case MSR_IA32_VMX_PROCBASED_CTLS2: | ||
| 4051 | *pdata = vmx_control_msr( | ||
| 4052 | msrs->secondary_ctls_low, | ||
| 4053 | msrs->secondary_ctls_high); | ||
| 4054 | break; | ||
| 4055 | case MSR_IA32_VMX_EPT_VPID_CAP: | ||
| 4056 | *pdata = msrs->ept_caps | | ||
| 4057 | ((u64)msrs->vpid_caps << 32); | ||
| 4058 | break; | ||
| 4059 | case MSR_IA32_VMX_VMFUNC: | ||
| 4060 | *pdata = msrs->vmfunc_controls; | ||
| 4061 | break; | ||
| 4062 | default: | ||
| 4063 | return 1; | ||
| 4064 | } | ||
| 4065 | |||
| 4066 | return 0; | ||
| 4067 | } | ||
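For reference, a sketch of how the read path above derives a "non-true" control MSR from the "true" one by forcing the default1 class of bits into the must-be-1 half; this is also why vmx_set_vmx_msr() only accepts the TRUE variants for restore. The mask and halves below are invented stand-ins for the real *_ALWAYSON_WITHOUT_TRUE_MSR constants:

#include <stdint.h>
#include <stdio.h>

/* Invented default1 mask; the kernel uses PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR etc. */
#define EXAMPLE_ALWAYSON_WITHOUT_TRUE_MSR 0x00000016u

/* Same packing as vmx_control_msr() above. */
static uint64_t control_msr(uint32_t low, uint32_t high)
{
	return low | ((uint64_t)high << 32);
}

int main(void)
{
	uint32_t true_low = 0x00000006, true_high = 0x000000ff;	/* invented */

	uint64_t true_msr    = control_msr(true_low, true_high);
	/* The non-true MSR additionally reports the default1 bits as must-be-1. */
	uint64_t nontrue_msr = true_msr | EXAMPLE_ALWAYSON_WITHOUT_TRUE_MSR;

	printf("true:     %#018llx\n", (unsigned long long)true_msr);
	printf("non-true: %#018llx\n", (unsigned long long)nontrue_msr);
	return 0;
}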
| 4068 | |||
| 4069 | static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, | ||
| 4070 | uint64_t val) | ||
| 4071 | { | ||
| 4072 | uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits; | ||
| 4073 | |||
| 4074 | return !(val & ~valid_bits); | ||
| 4075 | } | ||
| 4076 | |||
| 4077 | static int vmx_get_msr_feature(struct kvm_msr_entry *msr) | ||
| 4078 | { | ||
| 4079 | switch (msr->index) { | ||
| 4080 | case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: | ||
| 4081 | if (!nested) | ||
| 4082 | return 1; | ||
| 4083 | return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); | ||
| 4084 | default: | ||
| 4085 | return 1; | ||
| 4086 | } | ||
| 4087 | |||
| 4088 | return 0; | ||
| 4089 | } | ||
| 4090 | |||
| 4091 | /* | ||
| 4092 | * Reads the MSR identified by msr_info->index into msr_info->data. | ||
| 4093 | * Returns 0 on success, non-0 otherwise. | ||
| 4094 | * Assumes vcpu_load() was already called. | ||
| 4095 | */ | ||
| 4096 | static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) | ||
| 4097 | { | ||
| 4098 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 4099 | struct shared_msr_entry *msr; | ||
| 4100 | |||
| 4101 | switch (msr_info->index) { | ||
| 4102 | #ifdef CONFIG_X86_64 | ||
| 4103 | case MSR_FS_BASE: | ||
| 4104 | msr_info->data = vmcs_readl(GUEST_FS_BASE); | ||
| 4105 | break; | ||
| 4106 | case MSR_GS_BASE: | ||
| 4107 | msr_info->data = vmcs_readl(GUEST_GS_BASE); | ||
| 4108 | break; | ||
| 4109 | case MSR_KERNEL_GS_BASE: | ||
| 4110 | msr_info->data = vmx_read_guest_kernel_gs_base(vmx); | ||
| 4111 | break; | ||
| 4112 | #endif | ||
| 4113 | case MSR_EFER: | ||
| 4114 | return kvm_get_msr_common(vcpu, msr_info); | ||
| 4115 | case MSR_IA32_SPEC_CTRL: | ||
| 4116 | if (!msr_info->host_initiated && | ||
| 4117 | !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) | ||
| 4118 | return 1; | ||
| 4119 | |||
| 4120 | msr_info->data = to_vmx(vcpu)->spec_ctrl; | ||
| 4121 | break; | ||
| 4122 | case MSR_IA32_ARCH_CAPABILITIES: | ||
| 4123 | if (!msr_info->host_initiated && | ||
| 4124 | !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) | ||
| 4125 | return 1; | ||
| 4126 | msr_info->data = to_vmx(vcpu)->arch_capabilities; | ||
| 4127 | break; | ||
| 4128 | case MSR_IA32_SYSENTER_CS: | ||
| 4129 | msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); | ||
| 4130 | break; | ||
| 4131 | case MSR_IA32_SYSENTER_EIP: | ||
| 4132 | msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); | ||
| 4133 | break; | ||
| 4134 | case MSR_IA32_SYSENTER_ESP: | ||
| 4135 | msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); | ||
| 4136 | break; | ||
| 4137 | case MSR_IA32_BNDCFGS: | ||
| 4138 | if (!kvm_mpx_supported() || | ||
| 4139 | (!msr_info->host_initiated && | ||
| 4140 | !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) | ||
| 4141 | return 1; | ||
| 4142 | msr_info->data = vmcs_read64(GUEST_BNDCFGS); | ||
| 4143 | break; | ||
| 4144 | case MSR_IA32_MCG_EXT_CTL: | ||
| 4145 | if (!msr_info->host_initiated && | ||
| 4146 | !(vmx->msr_ia32_feature_control & | ||
| 4147 | FEATURE_CONTROL_LMCE)) | ||
| 4148 | return 1; | ||
| 4149 | msr_info->data = vcpu->arch.mcg_ext_ctl; | ||
| 4150 | break; | ||
| 4151 | case MSR_IA32_FEATURE_CONTROL: | ||
| 4152 | msr_info->data = vmx->msr_ia32_feature_control; | ||
| 4153 | break; | ||
| 4154 | case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: | ||
| 4155 | if (!nested_vmx_allowed(vcpu)) | ||
| 4156 | return 1; | ||
| 4157 | return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, | ||
| 4158 | &msr_info->data); | ||
| 4159 | case MSR_IA32_XSS: | ||
| 4160 | if (!vmx_xsaves_supported()) | ||
| 4161 | return 1; | ||
| 4162 | msr_info->data = vcpu->arch.ia32_xss; | ||
| 4163 | break; | ||
| 4164 | case MSR_TSC_AUX: | ||
| 4165 | if (!msr_info->host_initiated && | ||
| 4166 | !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) | ||
| 4167 | return 1; | ||
| 4168 | /* Otherwise falls through */ | ||
| 4169 | default: | ||
| 4170 | msr = find_msr_entry(vmx, msr_info->index); | ||
| 4171 | if (msr) { | ||
| 4172 | msr_info->data = msr->data; | ||
| 4173 | break; | ||
| 4174 | } | ||
| 4175 | return kvm_get_msr_common(vcpu, msr_info); | ||
| 4176 | } | ||
| 4177 | |||
| 4178 | return 0; | ||
| 4179 | } | ||
| 4180 | |||
| 4181 | static void vmx_leave_nested(struct kvm_vcpu *vcpu); | ||
| 4182 | |||
| 4183 | /* | ||
| 4184 | * Writes the msr value into the appropriate "register". | ||
| 4185 | * Returns 0 on success, non-0 otherwise. | ||
| 4186 | * Assumes vcpu_load() was already called. | ||
| 4187 | */ | ||
| 4188 | static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) | ||
| 4189 | { | ||
| 4190 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 4191 | struct shared_msr_entry *msr; | ||
| 4192 | int ret = 0; | ||
| 4193 | u32 msr_index = msr_info->index; | ||
| 4194 | u64 data = msr_info->data; | ||
| 4195 | |||
| 4196 | switch (msr_index) { | ||
| 4197 | case MSR_EFER: | ||
| 4198 | ret = kvm_set_msr_common(vcpu, msr_info); | ||
| 4199 | break; | ||
| 4200 | #ifdef CONFIG_X86_64 | ||
| 4201 | case MSR_FS_BASE: | ||
| 4202 | vmx_segment_cache_clear(vmx); | ||
| 4203 | vmcs_writel(GUEST_FS_BASE, data); | ||
| 4204 | break; | ||
| 4205 | case MSR_GS_BASE: | ||
| 4206 | vmx_segment_cache_clear(vmx); | ||
| 4207 | vmcs_writel(GUEST_GS_BASE, data); | ||
| 4208 | break; | ||
| 4209 | case MSR_KERNEL_GS_BASE: | ||
| 4210 | vmx_write_guest_kernel_gs_base(vmx, data); | ||
| 4211 | break; | ||
| 4212 | #endif | ||
| 4213 | case MSR_IA32_SYSENTER_CS: | ||
| 4214 | vmcs_write32(GUEST_SYSENTER_CS, data); | ||
| 4215 | break; | ||
| 4216 | case MSR_IA32_SYSENTER_EIP: | ||
| 4217 | vmcs_writel(GUEST_SYSENTER_EIP, data); | ||
| 4218 | break; | ||
| 4219 | case MSR_IA32_SYSENTER_ESP: | ||
| 4220 | vmcs_writel(GUEST_SYSENTER_ESP, data); | ||
| 4221 | break; | ||
| 4222 | case MSR_IA32_BNDCFGS: | ||
| 4223 | if (!kvm_mpx_supported() || | ||
| 4224 | (!msr_info->host_initiated && | ||
| 4225 | !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) | ||
| 4226 | return 1; | ||
| 4227 | if (is_noncanonical_address(data & PAGE_MASK, vcpu) || | ||
| 4228 | (data & MSR_IA32_BNDCFGS_RSVD)) | ||
| 4229 | return 1; | ||
| 4230 | vmcs_write64(GUEST_BNDCFGS, data); | ||
| 4231 | break; | ||
| 4232 | case MSR_IA32_SPEC_CTRL: | ||
| 4233 | if (!msr_info->host_initiated && | ||
| 4234 | !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) | ||
| 4235 | return 1; | ||
| 4236 | |||
| 4237 | /* The STIBP bit doesn't fault even if it's not advertised */ | ||
| 4238 | if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD)) | ||
| 4239 | return 1; | ||
| 4240 | |||
| 4241 | vmx->spec_ctrl = data; | ||
| 4242 | |||
| 4243 | if (!data) | ||
| 4244 | break; | ||
| 4245 | |||
| 4246 | /* | ||
| 4247 | * For non-nested: | ||
| 4248 | * When it's written (to non-zero) for the first time, pass | ||
| 4249 | * it through. | ||
| 4250 | * | ||
| 4251 | * For nested: | ||
| 4252 | * The handling of the MSR bitmap for L2 guests is done in | ||
| 4253 | * nested_vmx_merge_msr_bitmap. We should not touch the | ||
| 4254 | * vmcs02.msr_bitmap here since it gets completely overwritten | ||
| 4255 | * in the merging. We update the vmcs01 here for L1 as well | ||
| 4256 | * since it will end up touching the MSR anyway now. | ||
| 4257 | */ | ||
| 4258 | vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, | ||
| 4259 | MSR_IA32_SPEC_CTRL, | ||
| 4260 | MSR_TYPE_RW); | ||
| 4261 | break; | ||
| 4262 | case MSR_IA32_PRED_CMD: | ||
| 4263 | if (!msr_info->host_initiated && | ||
| 4264 | !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) | ||
| 4265 | return 1; | ||
| 4266 | |||
| 4267 | if (data & ~PRED_CMD_IBPB) | ||
| 4268 | return 1; | ||
| 4269 | |||
| 4270 | if (!data) | ||
| 4271 | break; | ||
| 4272 | |||
| 4273 | wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); | ||
| 4274 | |||
| 4275 | /* | ||
| 4276 | * For non-nested: | ||
| 4277 | * When it's written (to non-zero) for the first time, pass | ||
| 4278 | * it through. | ||
| 4279 | * | ||
| 4280 | * For nested: | ||
| 4281 | * The handling of the MSR bitmap for L2 guests is done in | ||
| 4282 | * nested_vmx_merge_msr_bitmap. We should not touch the | ||
| 4283 | * vmcs02.msr_bitmap here since it gets completely overwritten | ||
| 4284 | * in the merging. | ||
| 4285 | */ | ||
| 4286 | vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD, | ||
| 4287 | MSR_TYPE_W); | ||
| 4288 | break; | ||
| 4289 | case MSR_IA32_ARCH_CAPABILITIES: | ||
| 4290 | if (!msr_info->host_initiated) | ||
| 4291 | return 1; | ||
| 4292 | vmx->arch_capabilities = data; | ||
| 4293 | break; | ||
| 4294 | case MSR_IA32_CR_PAT: | ||
| 4295 | if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { | ||
| 4296 | if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) | ||
| 4297 | return 1; | ||
| 4298 | vmcs_write64(GUEST_IA32_PAT, data); | ||
| 4299 | vcpu->arch.pat = data; | ||
| 4300 | break; | ||
| 4301 | } | ||
| 4302 | ret = kvm_set_msr_common(vcpu, msr_info); | ||
| 4303 | break; | ||
| 4304 | case MSR_IA32_TSC_ADJUST: | ||
| 4305 | ret = kvm_set_msr_common(vcpu, msr_info); | ||
| 4306 | break; | ||
| 4307 | case MSR_IA32_MCG_EXT_CTL: | ||
| 4308 | if ((!msr_info->host_initiated && | ||
| 4309 | !(to_vmx(vcpu)->msr_ia32_feature_control & | ||
| 4310 | FEATURE_CONTROL_LMCE)) || | ||
| 4311 | (data & ~MCG_EXT_CTL_LMCE_EN)) | ||
| 4312 | return 1; | ||
| 4313 | vcpu->arch.mcg_ext_ctl = data; | ||
| 4314 | break; | ||
| 4315 | case MSR_IA32_FEATURE_CONTROL: | ||
| 4316 | if (!vmx_feature_control_msr_valid(vcpu, data) || | ||
| 4317 | (to_vmx(vcpu)->msr_ia32_feature_control & | ||
| 4318 | FEATURE_CONTROL_LOCKED && !msr_info->host_initiated)) | ||
| 4319 | return 1; | ||
| 4320 | vmx->msr_ia32_feature_control = data; | ||
| 4321 | if (msr_info->host_initiated && data == 0) | ||
| 4322 | vmx_leave_nested(vcpu); | ||
| 4323 | break; | ||
| 4324 | case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: | ||
| 4325 | if (!msr_info->host_initiated) | ||
| 4326 | return 1; /* they are read-only */ | ||
| 4327 | if (!nested_vmx_allowed(vcpu)) | ||
| 4328 | return 1; | ||
| 4329 | return vmx_set_vmx_msr(vcpu, msr_index, data); | ||
| 4330 | case MSR_IA32_XSS: | ||
| 4331 | if (!vmx_xsaves_supported()) | ||
| 4332 | return 1; | ||
| 4333 | /* | ||
| 4334 | * As of Skylake, the only bit defined in IA32_XSS is bit 8, but | ||
| 4335 | * it is not supported in KVM. | ||
| 4336 | */ | ||
| 4337 | if (data != 0) | ||
| 4338 | return 1; | ||
| 4339 | vcpu->arch.ia32_xss = data; | ||
| 4340 | if (vcpu->arch.ia32_xss != host_xss) | ||
| 4341 | add_atomic_switch_msr(vmx, MSR_IA32_XSS, | ||
| 4342 | vcpu->arch.ia32_xss, host_xss, false); | ||
| 4343 | else | ||
| 4344 | clear_atomic_switch_msr(vmx, MSR_IA32_XSS); | ||
| 4345 | break; | ||
| 4346 | case MSR_TSC_AUX: | ||
| 4347 | if (!msr_info->host_initiated && | ||
| 4348 | !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) | ||
| 4349 | return 1; | ||
| 4350 | /* Check reserved bits: the upper 32 bits must be zero */ | ||
| 4351 | if ((data >> 32) != 0) | ||
| 4352 | return 1; | ||
| 4353 | /* Otherwise falls through */ | ||
| 4354 | default: | ||
| 4355 | msr = find_msr_entry(vmx, msr_index); | ||
| 4356 | if (msr) { | ||
| 4357 | u64 old_msr_data = msr->data; | ||
| 4358 | msr->data = data; | ||
| 4359 | if (msr - vmx->guest_msrs < vmx->save_nmsrs) { | ||
| 4360 | preempt_disable(); | ||
| 4361 | ret = kvm_set_shared_msr(msr->index, msr->data, | ||
| 4362 | msr->mask); | ||
| 4363 | preempt_enable(); | ||
| 4364 | if (ret) | ||
| 4365 | msr->data = old_msr_data; | ||
| 4366 | } | ||
| 4367 | break; | ||
| 4368 | } | ||
| 4369 | ret = kvm_set_msr_common(vcpu, msr_info); | ||
| 4370 | } | ||
| 4371 | |||
| 4372 | return ret; | ||
| 4373 | } | ||
| 4374 | |||
| 4375 | static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) | ||
| 4376 | { | ||
| 4377 | __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); | ||
| 4378 | switch (reg) { | ||
| 4379 | case VCPU_REGS_RSP: | ||
| 4380 | vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); | ||
| 4381 | break; | ||
| 4382 | case VCPU_REGS_RIP: | ||
| 4383 | vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); | ||
| 4384 | break; | ||
| 4385 | case VCPU_EXREG_PDPTR: | ||
| 4386 | if (enable_ept) | ||
| 4387 | ept_save_pdptrs(vcpu); | ||
| 4388 | break; | ||
| 4389 | default: | ||
| 4390 | break; | ||
| 4391 | } | ||
| 4392 | } | ||
| 4393 | |||
| 4394 | static __init int cpu_has_kvm_support(void) | ||
| 4395 | { | ||
| 4396 | return cpu_has_vmx(); | ||
| 4397 | } | ||
| 4398 | |||
| 4399 | static __init int vmx_disabled_by_bios(void) | ||
| 4400 | { | ||
| 4401 | u64 msr; | ||
| 4402 | |||
| 4403 | rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); | ||
| 4404 | if (msr & FEATURE_CONTROL_LOCKED) { | ||
| 4405 | /* launched w/ TXT and VMX disabled */ | ||
| 4406 | if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) | ||
| 4407 | && tboot_enabled()) | ||
| 4408 | return 1; | ||
| 4409 | /* launched w/o TXT and VMX only enabled w/ TXT */ | ||
| 4410 | if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) | ||
| 4411 | && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) | ||
| 4412 | && !tboot_enabled()) { | ||
| 4413 | printk(KERN_WARNING "kvm: disable TXT in the BIOS or " | ||
| 4414 | "activate TXT before enabling KVM\n"); | ||
| 4415 | return 1; | ||
| 4416 | } | ||
| 4417 | /* launched w/o TXT and VMX disabled */ | ||
| 4418 | if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) | ||
| 4419 | && !tboot_enabled()) | ||
| 4420 | return 1; | ||
| 4421 | } | ||
| 4422 | |||
| 4423 | return 0; | ||
| 4424 | } | ||
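To make the three locked-BIOS cases above easier to follow, here is a hypothetical userspace mirror of vmx_disabled_by_bios() with the MSR read replaced by explicit booleans (the function name and parameters are invented for the example):

#include <stdbool.h>
#include <stdio.h>

static bool vmx_disabled_by_bios_sketch(bool locked, bool enabled_inside_smx,
					bool enabled_outside_smx, bool tboot)
{
	if (!locked)
		return false;	/* hardware_enable() can still set and lock the bits */
	if (!enabled_inside_smx && tboot)
		return true;	/* launched with TXT, VMX-in-SMX disabled */
	if (!enabled_outside_smx && enabled_inside_smx && !tboot)
		return true;	/* no TXT, but VMX only enabled inside SMX */
	if (!enabled_outside_smx && !tboot)
		return true;	/* no TXT and VMX disabled outright */
	return false;
}

int main(void)
{
	/* Typical "VMX enabled in BIOS": locked + enabled outside SMX. */
	printf("%d\n", vmx_disabled_by_bios_sketch(true, false, true, false));	/* 0 */
	/* Locked with everything disabled. */
	printf("%d\n", vmx_disabled_by_bios_sketch(true, false, false, false));	/* 1 */
	return 0;
}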
| 4425 | |||
| 4426 | static void kvm_cpu_vmxon(u64 addr) | ||
| 4427 | { | ||
| 4428 | cr4_set_bits(X86_CR4_VMXE); | ||
| 4429 | intel_pt_handle_vmx(1); | ||
| 4430 | |||
| 4431 | asm volatile ("vmxon %0" : : "m"(addr)); | ||
| 4432 | } | ||
| 4433 | |||
| 4434 | static int hardware_enable(void) | ||
| 4435 | { | ||
| 4436 | int cpu = raw_smp_processor_id(); | ||
| 4437 | u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); | ||
| 4438 | u64 old, test_bits; | ||
| 4439 | |||
| 4440 | if (cr4_read_shadow() & X86_CR4_VMXE) | ||
| 4441 | return -EBUSY; | ||
| 4442 | |||
| 4443 | /* | ||
| 4444 | * This can happen if we hot-added a CPU but failed to allocate | ||
| 4445 | * VP assist page for it. | ||
| 4446 | */ | ||
| 4447 | if (static_branch_unlikely(&enable_evmcs) && | ||
| 4448 | !hv_get_vp_assist_page(cpu)) | ||
| 4449 | return -EFAULT; | ||
| 4450 | |||
| 4451 | INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); | ||
| 4452 | INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); | ||
| 4453 | spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); | ||
| 4454 | |||
| 4455 | /* | ||
| 4456 | * Now we can enable the vmclear operation in kdump | ||
| 4457 | * since the loaded_vmcss_on_cpu list on this cpu | ||
| 4458 | * has been initialized. | ||
| 4459 | * | ||
| 4460 | * Though the cpu is not in VMX operation now, it is | ||
| 4461 | * safe to enable the vmclear operation here because | ||
| 4462 | * the loaded_vmcss_on_cpu list is still empty. | ||
| 4463 | */ | ||
| 4464 | crash_enable_local_vmclear(cpu); | ||
| 4465 | |||
| 4466 | rdmsrl(MSR_IA32_FEATURE_CONTROL, old); | ||
| 4467 | |||
| 4468 | test_bits = FEATURE_CONTROL_LOCKED; | ||
| 4469 | test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; | ||
| 4470 | if (tboot_enabled()) | ||
| 4471 | test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX; | ||
| 4472 | |||
| 4473 | if ((old & test_bits) != test_bits) { | ||
| 4474 | /* enable and lock */ | ||
| 4475 | wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); | ||
| 4476 | } | ||
| 4477 | kvm_cpu_vmxon(phys_addr); | ||
| 4478 | if (enable_ept) | ||
| 4479 | ept_sync_global(); | ||
| 4480 | |||
| 4481 | return 0; | ||
| 4482 | } | ||
| 4483 | |||
| 4484 | static void vmclear_local_loaded_vmcss(void) | ||
| 4485 | { | ||
| 4486 | int cpu = raw_smp_processor_id(); | ||
| 4487 | struct loaded_vmcs *v, *n; | ||
| 4488 | |||
| 4489 | list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), | ||
| 4490 | loaded_vmcss_on_cpu_link) | ||
| 4491 | __loaded_vmcs_clear(v); | ||
| 4492 | } | ||
| 4493 | |||
| 4494 | |||
| 4495 | /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() | ||
| 4496 | * tricks. | ||
| 4497 | */ | ||
| 4498 | static void kvm_cpu_vmxoff(void) | ||
| 4499 | { | ||
| 4500 | asm volatile (__ex("vmxoff")); | ||
| 4501 | |||
| 4502 | intel_pt_handle_vmx(0); | ||
| 4503 | cr4_clear_bits(X86_CR4_VMXE); | ||
| 4504 | } | ||
| 4505 | |||
| 4506 | static void hardware_disable(void) | ||
| 4507 | { | ||
| 4508 | vmclear_local_loaded_vmcss(); | ||
| 4509 | kvm_cpu_vmxoff(); | ||
| 4510 | } | ||
| 4511 | |||
| 4512 | static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, | ||
| 4513 | u32 msr, u32 *result) | ||
| 4514 | { | ||
| 4515 | u32 vmx_msr_low, vmx_msr_high; | ||
| 4516 | u32 ctl = ctl_min | ctl_opt; | ||
| 4517 | |||
| 4518 | rdmsr(msr, vmx_msr_low, vmx_msr_high); | ||
| 4519 | |||
| 4520 | ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ | ||
| 4521 | ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ | ||
| 4522 | |||
| 4523 | /* Ensure minimum (required) set of control bits are supported. */ | ||
| 4524 | if (ctl_min & ~ctl) | ||
| 4525 | return -EIO; | ||
| 4526 | |||
| 4527 | *result = ctl; | ||
| 4528 | return 0; | ||
| 4529 | } | ||
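A worked example of the adjustment above, with invented capability-MSR words and the rdmsr() factored out so the sketch can run in userspace:

#include <assert.h>
#include <stdint.h>

/* Mirrors adjust_vmx_controls() above, minus the rdmsr(). */
static int adjust_controls_sketch(uint32_t ctl_min, uint32_t ctl_opt,
				  uint32_t msr_low, uint32_t msr_high,
				  uint32_t *result)
{
	uint32_t ctl = ctl_min | ctl_opt;

	ctl &= msr_high;	/* bit == 0 in high word ==> must be zero */
	ctl |= msr_low;		/* bit == 1 in low word  ==> must be one  */

	if (ctl_min & ~ctl)
		return -1;	/* a required bit is not supported */
	*result = ctl;
	return 0;
}

int main(void)
{
	uint32_t out;

	/* CPU allows bits 0-7 and forces bit 1; we require bit 2, want bit 6. */
	assert(adjust_controls_sketch(0x04, 0x40, 0x02, 0xff, &out) == 0);
	assert(out == 0x46);	/* required bit 2, optional bit 6, forced bit 1 */

	/* Same request, but the CPU cannot set bit 2 at all -> error. */
	assert(adjust_controls_sketch(0x04, 0x40, 0x02, 0xfb, &out) != 0);
	return 0;
}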
| 4530 | |||
| 4531 | static __init bool allow_1_setting(u32 msr, u32 ctl) | ||
| 4532 | { | ||
| 4533 | u32 vmx_msr_low, vmx_msr_high; | ||
| 4534 | |||
| 4535 | rdmsr(msr, vmx_msr_low, vmx_msr_high); | ||
| 4536 | return vmx_msr_high & ctl; | ||
| 4537 | } | ||
| 4538 | |||
| 4539 | static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | ||
| 4540 | { | ||
| 4541 | u32 vmx_msr_low, vmx_msr_high; | ||
| 4542 | u32 min, opt, min2, opt2; | ||
| 4543 | u32 _pin_based_exec_control = 0; | ||
| 4544 | u32 _cpu_based_exec_control = 0; | ||
| 4545 | u32 _cpu_based_2nd_exec_control = 0; | ||
| 4546 | u32 _vmexit_control = 0; | ||
| 4547 | u32 _vmentry_control = 0; | ||
| 4548 | |||
| 4549 | memset(vmcs_conf, 0, sizeof(*vmcs_conf)); | ||
| 4550 | min = CPU_BASED_HLT_EXITING | | ||
| 4551 | #ifdef CONFIG_X86_64 | ||
| 4552 | CPU_BASED_CR8_LOAD_EXITING | | ||
| 4553 | CPU_BASED_CR8_STORE_EXITING | | ||
| 4554 | #endif | ||
| 4555 | CPU_BASED_CR3_LOAD_EXITING | | ||
| 4556 | CPU_BASED_CR3_STORE_EXITING | | ||
| 4557 | CPU_BASED_UNCOND_IO_EXITING | | ||
| 4558 | CPU_BASED_MOV_DR_EXITING | | ||
| 4559 | CPU_BASED_USE_TSC_OFFSETING | | ||
| 4560 | CPU_BASED_MWAIT_EXITING | | ||
| 4561 | CPU_BASED_MONITOR_EXITING | | ||
| 4562 | CPU_BASED_INVLPG_EXITING | | ||
| 4563 | CPU_BASED_RDPMC_EXITING; | ||
| 4564 | |||
| 4565 | opt = CPU_BASED_TPR_SHADOW | | ||
| 4566 | CPU_BASED_USE_MSR_BITMAPS | | ||
| 4567 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; | ||
| 4568 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, | ||
| 4569 | &_cpu_based_exec_control) < 0) | ||
| 4570 | return -EIO; | ||
| 4571 | #ifdef CONFIG_X86_64 | ||
| 4572 | if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) | ||
| 4573 | _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & | ||
| 4574 | ~CPU_BASED_CR8_STORE_EXITING; | ||
| 4575 | #endif | ||
| 4576 | if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { | ||
| 4577 | min2 = 0; | ||
| 4578 | opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | | ||
| 4579 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | | ||
| 4580 | SECONDARY_EXEC_WBINVD_EXITING | | ||
| 4581 | SECONDARY_EXEC_ENABLE_VPID | | ||
| 4582 | SECONDARY_EXEC_ENABLE_EPT | | ||
| 4583 | SECONDARY_EXEC_UNRESTRICTED_GUEST | | ||
| 4584 | SECONDARY_EXEC_PAUSE_LOOP_EXITING | | ||
| 4585 | SECONDARY_EXEC_DESC | | ||
| 4586 | SECONDARY_EXEC_RDTSCP | | ||
| 4587 | SECONDARY_EXEC_ENABLE_INVPCID | | ||
| 4588 | SECONDARY_EXEC_APIC_REGISTER_VIRT | | ||
| 4589 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | | ||
| 4590 | SECONDARY_EXEC_SHADOW_VMCS | | ||
| 4591 | SECONDARY_EXEC_XSAVES | | ||
| 4592 | SECONDARY_EXEC_RDSEED_EXITING | | ||
| 4593 | SECONDARY_EXEC_RDRAND_EXITING | | ||
| 4594 | SECONDARY_EXEC_ENABLE_PML | | ||
| 4595 | SECONDARY_EXEC_TSC_SCALING | | ||
| 4596 | SECONDARY_EXEC_ENABLE_VMFUNC | | ||
| 4597 | SECONDARY_EXEC_ENCLS_EXITING; | ||
| 4598 | if (adjust_vmx_controls(min2, opt2, | ||
| 4599 | MSR_IA32_VMX_PROCBASED_CTLS2, | ||
| 4600 | &_cpu_based_2nd_exec_control) < 0) | ||
| 4601 | return -EIO; | ||
| 4602 | } | ||
| 4603 | #ifndef CONFIG_X86_64 | ||
| 4604 | if (!(_cpu_based_2nd_exec_control & | ||
| 4605 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) | ||
| 4606 | _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; | ||
| 4607 | #endif | ||
| 4608 | |||
| 4609 | if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) | ||
| 4610 | _cpu_based_2nd_exec_control &= ~( | ||
| 4611 | SECONDARY_EXEC_APIC_REGISTER_VIRT | | ||
| 4612 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | | ||
| 4613 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); | ||
| 4614 | |||
| 4615 | rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, | ||
| 4616 | &vmx_capability.ept, &vmx_capability.vpid); | ||
| 4617 | |||
| 4618 | if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { | ||
| 4619 | /* CR3 accesses and invlpg don't need to cause VM Exits when EPT | ||
| 4620 | is enabled */ | ||
| 4621 | _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | | ||
| 4622 | CPU_BASED_CR3_STORE_EXITING | | ||
| 4623 | CPU_BASED_INVLPG_EXITING); | ||
| 4624 | } else if (vmx_capability.ept) { | ||
| 4625 | vmx_capability.ept = 0; | ||
| 4626 | pr_warn_once("EPT CAP should not exist if not support " | ||
| 4627 | "1-setting enable EPT VM-execution control\n"); | ||
| 4628 | } | ||
| 4629 | if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && | ||
| 4630 | vmx_capability.vpid) { | ||
| 4631 | vmx_capability.vpid = 0; | ||
| 4632 | pr_warn_once("VPID CAP should not exist if not support " | ||
| 4633 | "1-setting enable VPID VM-execution control\n"); | ||
| 4634 | } | ||
| 4635 | |||
| 4636 | min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; | ||
| 4637 | #ifdef CONFIG_X86_64 | ||
| 4638 | min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; | ||
| 4639 | #endif | ||
| 4640 | opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | | ||
| 4641 | VM_EXIT_CLEAR_BNDCFGS; | ||
| 4642 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, | ||
| 4643 | &_vmexit_control) < 0) | ||
| 4644 | return -EIO; | ||
| 4645 | |||
| 4646 | min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; | ||
| 4647 | opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR | | ||
| 4648 | PIN_BASED_VMX_PREEMPTION_TIMER; | ||
| 4649 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, | ||
| 4650 | &_pin_based_exec_control) < 0) | ||
| 4651 | return -EIO; | ||
| 4652 | |||
| 4653 | if (cpu_has_broken_vmx_preemption_timer()) | ||
| 4654 | _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; | ||
| 4655 | if (!(_cpu_based_2nd_exec_control & | ||
| 4656 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) | ||
| 4657 | _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; | ||
| 4658 | |||
| 4659 | min = VM_ENTRY_LOAD_DEBUG_CONTROLS; | ||
| 4660 | opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; | ||
| 4661 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, | ||
| 4662 | &_vmentry_control) < 0) | ||
| 4663 | return -EIO; | ||
| 4664 | |||
| 4665 | rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); | ||
| 4666 | |||
| 4667 | /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ | ||
| 4668 | if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) | ||
| 4669 | return -EIO; | ||
| 4670 | |||
| 4671 | #ifdef CONFIG_X86_64 | ||
| 4672 | /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ | ||
| 4673 | if (vmx_msr_high & (1u<<16)) | ||
| 4674 | return -EIO; | ||
| 4675 | #endif | ||
| 4676 | |||
| 4677 | /* Require Write-Back (WB) memory type for VMCS accesses. */ | ||
| 4678 | if (((vmx_msr_high >> 18) & 15) != 6) | ||
| 4679 | return -EIO; | ||
| 4680 | |||
| 4681 | vmcs_conf->size = vmx_msr_high & 0x1fff; | ||
| 4682 | vmcs_conf->order = get_order(vmcs_conf->size); | ||
| 4683 | vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; | ||
| 4684 | |||
| 4685 | vmcs_conf->revision_id = vmx_msr_low; | ||
| 4686 | |||
| 4687 | vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; | ||
| 4688 | vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; | ||
| 4689 | vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; | ||
| 4690 | vmcs_conf->vmexit_ctrl = _vmexit_control; | ||
| 4691 | vmcs_conf->vmentry_ctrl = _vmentry_control; | ||
| 4692 | |||
| 4693 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 4694 | evmcs_sanitize_exec_ctrls(vmcs_conf); | ||
| 4695 | |||
| 4696 | cpu_has_load_ia32_efer = | ||
| 4697 | allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, | ||
| 4698 | VM_ENTRY_LOAD_IA32_EFER) | ||
| 4699 | && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, | ||
| 4700 | VM_EXIT_LOAD_IA32_EFER); | ||
| 4701 | |||
| 4702 | cpu_has_load_perf_global_ctrl = | ||
| 4703 | allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, | ||
| 4704 | VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) | ||
| 4705 | && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, | ||
| 4706 | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); | ||
| 4707 | |||
| 4708 | /* | ||
| 4709 | * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL | ||
| 4710 | * but due to the errata below it can't be used. The workaround is to | ||
| 4711 | * use the MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. | ||
| 4712 | * | ||
| 4713 | * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32] | ||
| 4714 | * | ||
| 4715 | * AAK155 (model 26) | ||
| 4716 | * AAP115 (model 30) | ||
| 4717 | * AAT100 (model 37) | ||
| 4718 | * BC86,AAY89,BD102 (model 44) | ||
| 4719 | * BA97 (model 46) | ||
| 4720 | * | ||
| 4721 | */ | ||
| 4722 | if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) { | ||
| 4723 | switch (boot_cpu_data.x86_model) { | ||
| 4724 | case 26: | ||
| 4725 | case 30: | ||
| 4726 | case 37: | ||
| 4727 | case 44: | ||
| 4728 | case 46: | ||
| 4729 | cpu_has_load_perf_global_ctrl = false; | ||
| 4730 | printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " | ||
| 4731 | "does not work properly. Using workaround\n"); | ||
| 4732 | break; | ||
| 4733 | default: | ||
| 4734 | break; | ||
| 4735 | } | ||
| 4736 | } | ||
| 4737 | |||
| 4738 | if (boot_cpu_has(X86_FEATURE_XSAVES)) | ||
| 4739 | rdmsrl(MSR_IA32_XSS, host_xss); | ||
| 4740 | |||
| 4741 | return 0; | ||
| 4742 | } | ||
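The VMX_BASIC decoding near the end of setup_vmcs_config() can be tried out in isolation; the raw MSR value below is invented, but the shifts and masks match the ones used above:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t vmx_basic = 0x0098100000000004ULL;	/* invented example value */
	uint32_t low  = (uint32_t)vmx_basic;		/* revision id */
	uint32_t high = (uint32_t)(vmx_basic >> 32);

	printf("revision id        : %#x\n", low);
	printf("vmcs size          : %u bytes\n", high & 0x1fff);
	printf("phys addr <= 32bit : %s\n", (high & (1u << 16)) ? "yes" : "no");
	printf("memory type        : %u (6 == write-back)\n", (high >> 18) & 15);
	return 0;
}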
| 4743 | |||
| 4744 | static struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu) | ||
| 4745 | { | ||
| 4746 | int node = cpu_to_node(cpu); | ||
| 4747 | struct page *pages; | ||
| 4748 | struct vmcs *vmcs; | ||
| 4749 | |||
| 4750 | pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); | ||
| 4751 | if (!pages) | ||
| 4752 | return NULL; | ||
| 4753 | vmcs = page_address(pages); | ||
| 4754 | memset(vmcs, 0, vmcs_config.size); | ||
| 4755 | |||
| 4756 | /* KVM supports Enlightened VMCS v1 only */ | ||
| 4757 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 4758 | vmcs->hdr.revision_id = KVM_EVMCS_VERSION; | ||
| 4759 | else | ||
| 4760 | vmcs->hdr.revision_id = vmcs_config.revision_id; | ||
| 4761 | |||
| 4762 | if (shadow) | ||
| 4763 | vmcs->hdr.shadow_vmcs = 1; | ||
| 4764 | return vmcs; | ||
| 4765 | } | ||
| 4766 | |||
| 4767 | static void free_vmcs(struct vmcs *vmcs) | ||
| 4768 | { | ||
| 4769 | free_pages((unsigned long)vmcs, vmcs_config.order); | ||
| 4770 | } | ||
| 4771 | |||
| 4772 | /* | ||
| 4773 | * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded | ||
| 4774 | */ | ||
| 4775 | static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) | ||
| 4776 | { | ||
| 4777 | if (!loaded_vmcs->vmcs) | ||
| 4778 | return; | ||
| 4779 | loaded_vmcs_clear(loaded_vmcs); | ||
| 4780 | free_vmcs(loaded_vmcs->vmcs); | ||
| 4781 | loaded_vmcs->vmcs = NULL; | ||
| 4782 | if (loaded_vmcs->msr_bitmap) | ||
| 4783 | free_page((unsigned long)loaded_vmcs->msr_bitmap); | ||
| 4784 | WARN_ON(loaded_vmcs->shadow_vmcs != NULL); | ||
| 4785 | } | ||
| 4786 | |||
| 4787 | static struct vmcs *alloc_vmcs(bool shadow) | ||
| 4788 | { | ||
| 4789 | return alloc_vmcs_cpu(shadow, raw_smp_processor_id()); | ||
| 4790 | } | ||
| 4791 | |||
| 4792 | static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) | ||
| 4793 | { | ||
| 4794 | loaded_vmcs->vmcs = alloc_vmcs(false); | ||
| 4795 | if (!loaded_vmcs->vmcs) | ||
| 4796 | return -ENOMEM; | ||
| 4797 | |||
| 4798 | loaded_vmcs->shadow_vmcs = NULL; | ||
| 4799 | loaded_vmcs_init(loaded_vmcs); | ||
| 4800 | |||
| 4801 | if (cpu_has_vmx_msr_bitmap()) { | ||
| 4802 | loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); | ||
| 4803 | if (!loaded_vmcs->msr_bitmap) | ||
| 4804 | goto out_vmcs; | ||
| 4805 | memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); | ||
| 4806 | |||
| 4807 | if (IS_ENABLED(CONFIG_HYPERV) && | ||
| 4808 | static_branch_unlikely(&enable_evmcs) && | ||
| 4809 | (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { | ||
| 4810 | struct hv_enlightened_vmcs *evmcs = | ||
| 4811 | (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs; | ||
| 4812 | |||
| 4813 | evmcs->hv_enlightenments_control.msr_bitmap = 1; | ||
| 4814 | } | ||
| 4815 | } | ||
| 4816 | |||
| 4817 | memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); | ||
| 4818 | |||
| 4819 | return 0; | ||
| 4820 | |||
| 4821 | out_vmcs: | ||
| 4822 | free_loaded_vmcs(loaded_vmcs); | ||
| 4823 | return -ENOMEM; | ||
| 4824 | } | ||
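For context, the page that alloc_loaded_vmcs() fills with 0xff is the hardware MSR bitmap: four 1 KiB quadrants (read low-MSRs, read high-MSRs, write low-MSRs, write high-MSRs), one bit per MSR, and a set bit means "intercept". A rough userspace sketch of that layout, assuming the SDM quadrant offsets (the helper name is invented; the kernel's equivalent is vmx_disable_intercept_for_msr()):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MSR_BITMAP_SIZE 4096

static void msr_bitmap_clear_intercept(uint8_t *bitmap, uint32_t msr, int write)
{
	uint32_t base = write ? 0x800 : 0x000;	/* read quadrants come first */

	if (msr <= 0x1fff)
		bitmap[base + msr / 8] &= ~(1u << (msr % 8));
	else if (msr >= 0xc0000000 && msr <= 0xc0001fff)
		bitmap[base + 0x400 + (msr - 0xc0000000) / 8] &= ~(1u << (msr % 8));
	/* all other MSRs are always intercepted */
}

int main(void)
{
	uint8_t bitmap[MSR_BITMAP_SIZE];

	memset(bitmap, 0xff, sizeof(bitmap));		/* intercept every MSR */
	msr_bitmap_clear_intercept(bitmap, 0x48, 0);	/* pass through reads... */
	msr_bitmap_clear_intercept(bitmap, 0x48, 1);	/* ...and writes of MSR 0x48 */

	printf("read-low byte  : %#x\n", bitmap[0x000 + 0x48 / 8]);
	printf("write-low byte : %#x\n", bitmap[0x800 + 0x48 / 8]);
	return 0;
}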
| 4825 | |||
| 4826 | static void free_kvm_area(void) | ||
| 4827 | { | ||
| 4828 | int cpu; | ||
| 4829 | |||
| 4830 | for_each_possible_cpu(cpu) { | ||
| 4831 | free_vmcs(per_cpu(vmxarea, cpu)); | ||
| 4832 | per_cpu(vmxarea, cpu) = NULL; | ||
| 4833 | } | ||
| 4834 | } | ||
| 4835 | |||
| 4836 | enum vmcs_field_width { | ||
| 4837 | VMCS_FIELD_WIDTH_U16 = 0, | ||
| 4838 | VMCS_FIELD_WIDTH_U64 = 1, | ||
| 4839 | VMCS_FIELD_WIDTH_U32 = 2, | ||
| 4840 | VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3 | ||
| 4841 | }; | ||
| 4842 | |||
| 4843 | static inline int vmcs_field_width(unsigned long field) | ||
| 4844 | { | ||
| 4845 | if (0x1 & field) /* the *_HIGH fields are all 32 bit */ | ||
| 4846 | return VMCS_FIELD_WIDTH_U32; | ||
| 4847 | return (field >> 13) & 0x3 ; | ||
| 4848 | } | ||
| 4849 | |||
| 4850 | static inline int vmcs_field_readonly(unsigned long field) | ||
| 4851 | { | ||
| 4852 | return (((field >> 10) & 0x3) == 1); | ||
| 4853 | } | ||
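The two helpers above just pull apart the SDM's VMCS field encoding (bit 0 selects the high half of a 64-bit field, bits 11:10 the field type, bits 14:13 the width). A standalone sketch using a few well-known encodings for illustration:

#include <stdint.h>
#include <stdio.h>

/* Same decoding rules as vmcs_field_width()/vmcs_field_readonly() above. */
static const char *width_name(unsigned long field)
{
	static const char *names[] = { "u16", "u64", "u32", "natural" };

	if (field & 1)			/* *_HIGH companions are 32 bit */
		return "u32 (high half)";
	return names[(field >> 13) & 0x3];
}

int main(void)
{
	unsigned long fields[] = { 0x681e /* GUEST_RIP */,
				   0x4402 /* VM_EXIT_REASON */,
				   0x2400 /* GUEST_PHYSICAL_ADDRESS */,
				   0x2401 /* GUEST_PHYSICAL_ADDRESS_HIGH */ };

	for (unsigned int i = 0; i < sizeof(fields) / sizeof(fields[0]); i++)
		printf("%#06lx: width=%s read-only=%d\n", fields[i],
		       width_name(fields[i]),
		       (int)(((fields[i] >> 10) & 0x3) == 1));
	return 0;
}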
| 4854 | |||
| 4855 | static void init_vmcs_shadow_fields(void) | ||
| 4856 | { | ||
| 4857 | int i, j; | ||
| 4858 | |||
| 4859 | for (i = j = 0; i < max_shadow_read_only_fields; i++) { | ||
| 4860 | u16 field = shadow_read_only_fields[i]; | ||
| 4861 | if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && | ||
| 4862 | (i + 1 == max_shadow_read_only_fields || | ||
| 4863 | shadow_read_only_fields[i + 1] != field + 1)) | ||
| 4864 | pr_err("Missing field from shadow_read_only_field %x\n", | ||
| 4865 | field + 1); | ||
| 4866 | |||
| 4867 | clear_bit(field, vmx_vmread_bitmap); | ||
| 4868 | #ifdef CONFIG_X86_64 | ||
| 4869 | if (field & 1) | ||
| 4870 | continue; | ||
| 4871 | #endif | ||
| 4872 | if (j < i) | ||
| 4873 | shadow_read_only_fields[j] = field; | ||
| 4874 | j++; | ||
| 4875 | } | ||
| 4876 | max_shadow_read_only_fields = j; | ||
| 4877 | |||
| 4878 | for (i = j = 0; i < max_shadow_read_write_fields; i++) { | ||
| 4879 | u16 field = shadow_read_write_fields[i]; | ||
| 4880 | if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && | ||
| 4881 | (i + 1 == max_shadow_read_write_fields || | ||
| 4882 | shadow_read_write_fields[i + 1] != field + 1)) | ||
| 4883 | pr_err("Missing field from shadow_read_write_field %x\n", | ||
| 4884 | field + 1); | ||
| 4885 | |||
| 4886 | /* | ||
| 4887 | * PML and the preemption timer can be emulated, but the | ||
| 4888 | * processor cannot vmwrite to fields that don't exist | ||
| 4889 | * on bare metal. | ||
| 4890 | */ | ||
| 4891 | switch (field) { | ||
| 4892 | case GUEST_PML_INDEX: | ||
| 4893 | if (!cpu_has_vmx_pml()) | ||
| 4894 | continue; | ||
| 4895 | break; | ||
| 4896 | case VMX_PREEMPTION_TIMER_VALUE: | ||
| 4897 | if (!cpu_has_vmx_preemption_timer()) | ||
| 4898 | continue; | ||
| 4899 | break; | ||
| 4900 | case GUEST_INTR_STATUS: | ||
| 4901 | if (!cpu_has_vmx_apicv()) | ||
| 4902 | continue; | ||
| 4903 | break; | ||
| 4904 | default: | ||
| 4905 | break; | ||
| 4906 | } | ||
| 4907 | |||
| 4908 | clear_bit(field, vmx_vmwrite_bitmap); | ||
| 4909 | clear_bit(field, vmx_vmread_bitmap); | ||
| 4910 | #ifdef CONFIG_X86_64 | ||
| 4911 | if (field & 1) | ||
| 4912 | continue; | ||
| 4913 | #endif | ||
| 4914 | if (j < i) | ||
| 4915 | shadow_read_write_fields[j] = field; | ||
| 4916 | j++; | ||
| 4917 | } | ||
| 4918 | max_shadow_read_write_fields = j; | ||
| 4919 | } | ||
| 4920 | |||
| 4921 | static __init int alloc_kvm_area(void) | ||
| 4922 | { | ||
| 4923 | int cpu; | ||
| 4924 | |||
| 4925 | for_each_possible_cpu(cpu) { | ||
| 4926 | struct vmcs *vmcs; | ||
| 4927 | |||
| 4928 | vmcs = alloc_vmcs_cpu(false, cpu); | ||
| 4929 | if (!vmcs) { | ||
| 4930 | free_kvm_area(); | ||
| 4931 | return -ENOMEM; | ||
| 4932 | } | ||
| 4933 | |||
| 4934 | /* | ||
| 4935 | * When eVMCS is enabled, alloc_vmcs_cpu() sets | ||
| 4936 | * vmcs->revision_id to KVM_EVMCS_VERSION instead of | ||
| 4937 | * revision_id reported by MSR_IA32_VMX_BASIC. | ||
| 4938 | * | ||
| 4939 | * However, even though not explicitly documented by | ||
| 4940 | * TLFS, VMXArea passed as VMXON argument should | ||
| 4941 | * still be marked with revision_id reported by | ||
| 4942 | * physical CPU. | ||
| 4943 | */ | ||
| 4944 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 4945 | vmcs->hdr.revision_id = vmcs_config.revision_id; | ||
| 4946 | |||
| 4947 | per_cpu(vmxarea, cpu) = vmcs; | ||
| 4948 | } | ||
| 4949 | return 0; | ||
| 4950 | } | ||
| 4951 | |||
| 4952 | static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, | ||
| 4953 | struct kvm_segment *save) | ||
| 4954 | { | ||
| 4955 | if (!emulate_invalid_guest_state) { | ||
| 4956 | /* | ||
| 4957 | * CS and SS RPL should be equal during guest entry according | ||
| 4958 | * to VMX spec, but in reality it is not always so. Since vcpu | ||
| 4959 | * is in the middle of the transition from real mode to | ||
| 4960 | * protected mode it is safe to assume that RPL 0 is a good | ||
| 4961 | * default value. | ||
| 4962 | */ | ||
| 4963 | if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) | ||
| 4964 | save->selector &= ~SEGMENT_RPL_MASK; | ||
| 4965 | save->dpl = save->selector & SEGMENT_RPL_MASK; | ||
| 4966 | save->s = 1; | ||
| 4967 | } | ||
| 4968 | vmx_set_segment(vcpu, save, seg); | ||
| 4969 | } | ||
| 4970 | |||
| 4971 | static void enter_pmode(struct kvm_vcpu *vcpu) | ||
| 4972 | { | ||
| 4973 | unsigned long flags; | ||
| 4974 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 4975 | |||
| 4976 | /* | ||
| 4977 | * Update the real mode segment cache. It may not be up-to-date if a | ||
| 4978 | * segment register was written while the vcpu was in guest mode. | ||
| 4979 | */ | ||
| 4980 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); | ||
| 4981 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); | ||
| 4982 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); | ||
| 4983 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); | ||
| 4984 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); | ||
| 4985 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); | ||
| 4986 | |||
| 4987 | vmx->rmode.vm86_active = 0; | ||
| 4988 | |||
| 4989 | vmx_segment_cache_clear(vmx); | ||
| 4990 | |||
| 4991 | vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); | ||
| 4992 | |||
| 4993 | flags = vmcs_readl(GUEST_RFLAGS); | ||
| 4994 | flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; | ||
| 4995 | flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; | ||
| 4996 | vmcs_writel(GUEST_RFLAGS, flags); | ||
| 4997 | |||
| 4998 | vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | | ||
| 4999 | (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); | ||
| 5000 | |||
| 5001 | update_exception_bitmap(vcpu); | ||
| 5002 | |||
| 5003 | fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); | ||
| 5004 | fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); | ||
| 5005 | fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); | ||
| 5006 | fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); | ||
| 5007 | fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); | ||
| 5008 | fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); | ||
| 5009 | } | ||
| 5010 | |||
| 5011 | static void fix_rmode_seg(int seg, struct kvm_segment *save) | ||
| 5012 | { | ||
| 5013 | const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | ||
| 5014 | struct kvm_segment var = *save; | ||
| 5015 | |||
| 5016 | var.dpl = 0x3; | ||
| 5017 | if (seg == VCPU_SREG_CS) | ||
| 5018 | var.type = 0x3; | ||
| 5019 | |||
| 5020 | if (!emulate_invalid_guest_state) { | ||
| 5021 | var.selector = var.base >> 4; | ||
| 5022 | var.base = var.base & 0xffff0; | ||
| 5023 | var.limit = 0xffff; | ||
| 5024 | var.g = 0; | ||
| 5025 | var.db = 0; | ||
| 5026 | var.present = 1; | ||
| 5027 | var.s = 1; | ||
| 5028 | var.l = 0; | ||
| 5029 | var.unusable = 0; | ||
| 5030 | var.type = 0x3; | ||
| 5031 | var.avl = 0; | ||
| 5032 | if (save->base & 0xf) | ||
| 5033 | printk_once(KERN_WARNING "kvm: segment base is not " | ||
| 5034 | "paragraph aligned when entering " | ||
| 5035 | "protected mode (seg=%d)", seg); | ||
| 5036 | } | ||
| 5037 | |||
| 5038 | vmcs_write16(sf->selector, var.selector); | ||
| 5039 | vmcs_writel(sf->base, var.base); | ||
| 5040 | vmcs_write32(sf->limit, var.limit); | ||
| 5041 | vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); | ||
| 5042 | } | ||
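fix_rmode_seg() above derives the real-mode selector from the cached segment base (selector = base >> 4) and forces a 64 KiB limit with a 16-bit data-segment descriptor, mirroring how a real-mode CPU forms linear addresses from selector << 4. A minimal standalone sketch of that selector/base relationship (illustrative only; the names below are not from this file):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Real-mode linear address: (selector << 4) + offset, wrapped to 20 bits. */
static uint32_t real_mode_linear(uint16_t selector, uint16_t offset)
{
	return (((uint32_t)selector << 4) + offset) & 0xfffff;
}

int main(void)
{
	uint32_t base = 0xb8000;           /* paragraph-aligned segment base */
	uint16_t selector = base >> 4;     /* 0xb800, as fix_rmode_seg() derives it */

	assert((base & 0xf) == 0);         /* otherwise KVM prints the warning above */
	printf("selector=%#x linear=%#x\n", selector, real_mode_linear(selector, 0x10));
	return 0;
}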
| 5043 | |||
| 5044 | static void enter_rmode(struct kvm_vcpu *vcpu) | ||
| 5045 | { | ||
| 5046 | unsigned long flags; | ||
| 5047 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 5048 | struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); | ||
| 5049 | |||
| 5050 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); | ||
| 5051 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); | ||
| 5052 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); | ||
| 5053 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); | ||
| 5054 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); | ||
| 5055 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); | ||
| 5056 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); | ||
| 5057 | |||
| 5058 | vmx->rmode.vm86_active = 1; | ||
| 5059 | |||
| 5060 | /* | ||
| 5061 | * Very old userspace does not call KVM_SET_TSS_ADDR before entering | ||
| 5062 | * vcpu. Warn the user that an update is overdue. | ||
| 5063 | */ | ||
| 5064 | if (!kvm_vmx->tss_addr) | ||
| 5065 | printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR needs to be " | ||
| 5066 | "called before entering vcpu\n"); | ||
| 5067 | |||
| 5068 | vmx_segment_cache_clear(vmx); | ||
| 5069 | |||
| 5070 | vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr); | ||
| 5071 | vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); | ||
| 5072 | vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); | ||
| 5073 | |||
| 5074 | flags = vmcs_readl(GUEST_RFLAGS); | ||
| 5075 | vmx->rmode.save_rflags = flags; | ||
| 5076 | |||
| 5077 | flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; | ||
| 5078 | |||
| 5079 | vmcs_writel(GUEST_RFLAGS, flags); | ||
| 5080 | vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); | ||
| 5081 | update_exception_bitmap(vcpu); | ||
| 5082 | |||
| 5083 | fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); | ||
| 5084 | fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); | ||
| 5085 | fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); | ||
| 5086 | fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); | ||
| 5087 | fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); | ||
| 5088 | fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); | ||
| 5089 | |||
| 5090 | kvm_mmu_reset_context(vcpu); | ||
| 5091 | } | ||
| 5092 | |||
| 5093 | static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) | ||
| 5094 | { | ||
| 5095 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 5096 | struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); | ||
| 5097 | |||
| 5098 | if (!msr) | ||
| 5099 | return; | ||
| 5100 | |||
| 5101 | vcpu->arch.efer = efer; | ||
| 5102 | if (efer & EFER_LMA) { | ||
| 5103 | vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); | ||
| 5104 | msr->data = efer; | ||
| 5105 | } else { | ||
| 5106 | vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); | ||
| 5107 | |||
| 5108 | msr->data = efer & ~EFER_LME; | ||
| 5109 | } | ||
| 5110 | setup_msrs(vmx); | ||
| 5111 | } | ||
| 5112 | |||
| 5113 | #ifdef CONFIG_X86_64 | ||
| 5114 | |||
| 5115 | static void enter_lmode(struct kvm_vcpu *vcpu) | ||
| 5116 | { | ||
| 5117 | u32 guest_tr_ar; | ||
| 5118 | |||
| 5119 | vmx_segment_cache_clear(to_vmx(vcpu)); | ||
| 5120 | |||
| 5121 | guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); | ||
| 5122 | if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) { | ||
| 5123 | pr_debug_ratelimited("%s: tss fixup for long mode.\n", | ||
| 5124 | __func__); | ||
| 5125 | vmcs_write32(GUEST_TR_AR_BYTES, | ||
| 5126 | (guest_tr_ar & ~VMX_AR_TYPE_MASK) | ||
| 5127 | | VMX_AR_TYPE_BUSY_64_TSS); | ||
| 5128 | } | ||
| 5129 | vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); | ||
| 5130 | } | ||
| 5131 | |||
| 5132 | static void exit_lmode(struct kvm_vcpu *vcpu) | ||
| 5133 | { | ||
| 5134 | vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); | ||
| 5135 | vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); | ||
| 5136 | } | ||
| 5137 | |||
| 5138 | #endif | ||
| 5139 | |||
| 5140 | static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid, | ||
| 5141 | bool invalidate_gpa) | ||
| 5142 | { | ||
| 5143 | if (enable_ept && (invalidate_gpa || !enable_vpid)) { | ||
| 5144 | if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) | ||
| 5145 | return; | ||
| 5146 | ept_sync_context(construct_eptp(vcpu, | ||
| 5147 | vcpu->arch.mmu->root_hpa)); | ||
| 5148 | } else { | ||
| 5149 | vpid_sync_context(vpid); | ||
| 5150 | } | ||
| 5151 | } | ||
| 5152 | |||
| 5153 | static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) | ||
| 5154 | { | ||
| 5155 | __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa); | ||
| 5156 | } | ||
| 5157 | |||
| 5158 | static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) | ||
| 5159 | { | ||
| 5160 | int vpid = to_vmx(vcpu)->vpid; | ||
| 5161 | |||
| 5162 | if (!vpid_sync_vcpu_addr(vpid, addr)) | ||
| 5163 | vpid_sync_context(vpid); | ||
| 5164 | |||
| 5165 | /* | ||
| 5166 | * If VPIDs are not supported or enabled, then the above is a no-op. | ||
| 5167 | * But we don't really need a TLB flush in that case anyway, because | ||
| 5168 | * each VM entry/exit includes an implicit flush when VPID is 0. | ||
| 5169 | */ | ||
| 5170 | } | ||
| 5171 | |||
| 5172 | static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) | ||
| 5173 | { | ||
| 5174 | ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; | ||
| 5175 | |||
| 5176 | vcpu->arch.cr0 &= ~cr0_guest_owned_bits; | ||
| 5177 | vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; | ||
| 5178 | } | ||
| 5179 | |||
| 5180 | static void vmx_decache_cr3(struct kvm_vcpu *vcpu) | ||
| 5181 | { | ||
| 5182 | if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu))) | ||
| 5183 | vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); | ||
| 5184 | __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); | ||
| 5185 | } | ||
| 5186 | |||
| 5187 | static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) | ||
| 5188 | { | ||
| 5189 | ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; | ||
| 5190 | |||
| 5191 | vcpu->arch.cr4 &= ~cr4_guest_owned_bits; | ||
| 5192 | vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; | ||
| 5193 | } | ||
| 5194 | |||
| 5195 | static void ept_load_pdptrs(struct kvm_vcpu *vcpu) | ||
| 5196 | { | ||
| 5197 | struct kvm_mmu *mmu = vcpu->arch.walk_mmu; | ||
| 5198 | |||
| 5199 | if (!test_bit(VCPU_EXREG_PDPTR, | ||
| 5200 | (unsigned long *)&vcpu->arch.regs_dirty)) | ||
| 5201 | return; | ||
| 5202 | |||
| 5203 | if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { | ||
| 5204 | vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); | ||
| 5205 | vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); | ||
| 5206 | vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); | ||
| 5207 | vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); | ||
| 5208 | } | ||
| 5209 | } | ||
| 5210 | |||
| 5211 | static void ept_save_pdptrs(struct kvm_vcpu *vcpu) | ||
| 5212 | { | ||
| 5213 | struct kvm_mmu *mmu = vcpu->arch.walk_mmu; | ||
| 5214 | |||
| 5215 | if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { | ||
| 5216 | mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); | ||
| 5217 | mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); | ||
| 5218 | mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); | ||
| 5219 | mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); | ||
| 5220 | } | ||
| 5221 | |||
| 5222 | __set_bit(VCPU_EXREG_PDPTR, | ||
| 5223 | (unsigned long *)&vcpu->arch.regs_avail); | ||
| 5224 | __set_bit(VCPU_EXREG_PDPTR, | ||
| 5225 | (unsigned long *)&vcpu->arch.regs_dirty); | ||
| 5226 | } | ||
| 5227 | |||
| 5228 | static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) | ||
| 5229 | { | ||
| 5230 | u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; | ||
| 5231 | u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; | ||
| 5232 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 5233 | |||
| 5234 | if (to_vmx(vcpu)->nested.msrs.secondary_ctls_high & | ||
| 5235 | SECONDARY_EXEC_UNRESTRICTED_GUEST && | ||
| 5236 | nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) | ||
| 5237 | fixed0 &= ~(X86_CR0_PE | X86_CR0_PG); | ||
| 5238 | |||
| 5239 | return fixed_bits_valid(val, fixed0, fixed1); | ||
| 5240 | } | ||
| 5241 | |||
| 5242 | static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) | ||
| 5243 | { | ||
| 5244 | u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; | ||
| 5245 | u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; | ||
| 5246 | |||
| 5247 | return fixed_bits_valid(val, fixed0, fixed1); | ||
| 5248 | } | ||
| 5249 | |||
| 5250 | static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) | ||
| 5251 | { | ||
| 5252 | u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0; | ||
| 5253 | u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1; | ||
| 5254 | |||
| 5255 | return fixed_bits_valid(val, fixed0, fixed1); | ||
| 5256 | } | ||
| 5257 | |||
| 5258 | /* No difference in the restrictions on guest and host CR4 in VMX operation. */ | ||
| 5259 | #define nested_guest_cr4_valid nested_cr4_valid | ||
| 5260 | #define nested_host_cr4_valid nested_cr4_valid | ||
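The nested CR0/CR4 checks above rely on the VMX fixed-bit MSRs: every bit set in the FIXED0 MSR must be 1 in the register, and every bit clear in the FIXED1 MSR must be 0. fixed_bits_valid() is defined elsewhere in KVM; the sketch below is an assumed, standalone restatement of that predicate, shown only to make the helpers above concrete:

#include <stdbool.h>
#include <stdint.h>

/*
 * Assumed semantics: bits set in fixed0 must be set in val, and bits
 * clear in fixed1 must be clear in val (fixed0 is a subset of fixed1).
 */
static bool fixed_bits_valid(uint64_t val, uint64_t fixed0, uint64_t fixed1)
{
	return ((val & fixed1) | fixed0) == val;
}

/*
 * Example: with unrestricted guest, nested_guest_cr0_valid() clears
 * X86_CR0_PE and X86_CR0_PG from fixed0 first, so CR0 values with
 * PE=0/PG=0 also pass the check.
 */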
| 5261 | |||
| 5262 | static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); | ||
| 5263 | |||
| 5264 | static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, | ||
| 5265 | unsigned long cr0, | ||
| 5266 | struct kvm_vcpu *vcpu) | ||
| 5267 | { | ||
| 5268 | if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) | ||
| 5269 | vmx_decache_cr3(vcpu); | ||
| 5270 | if (!(cr0 & X86_CR0_PG)) { | ||
| 5271 | /* From paging/starting to nonpaging */ | ||
| 5272 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, | ||
| 5273 | vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) | | ||
| 5274 | (CPU_BASED_CR3_LOAD_EXITING | | ||
| 5275 | CPU_BASED_CR3_STORE_EXITING)); | ||
| 5276 | vcpu->arch.cr0 = cr0; | ||
| 5277 | vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); | ||
| 5278 | } else if (!is_paging(vcpu)) { | ||
| 5279 | /* From nonpaging to paging */ | ||
| 5280 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, | ||
| 5281 | vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & | ||
| 5282 | ~(CPU_BASED_CR3_LOAD_EXITING | | ||
| 5283 | CPU_BASED_CR3_STORE_EXITING)); | ||
| 5284 | vcpu->arch.cr0 = cr0; | ||
| 5285 | vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); | ||
| 5286 | } | ||
| 5287 | |||
| 5288 | if (!(cr0 & X86_CR0_WP)) | ||
| 5289 | *hw_cr0 &= ~X86_CR0_WP; | ||
| 5290 | } | ||
| 5291 | |||
| 5292 | static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | ||
| 5293 | { | ||
| 5294 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 5295 | unsigned long hw_cr0; | ||
| 5296 | |||
| 5297 | hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); | ||
| 5298 | if (enable_unrestricted_guest) | ||
| 5299 | hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; | ||
| 5300 | else { | ||
| 5301 | hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; | ||
| 5302 | |||
| 5303 | if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) | ||
| 5304 | enter_pmode(vcpu); | ||
| 5305 | |||
| 5306 | if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) | ||
| 5307 | enter_rmode(vcpu); | ||
| 5308 | } | ||
| 5309 | |||
| 5310 | #ifdef CONFIG_X86_64 | ||
| 5311 | if (vcpu->arch.efer & EFER_LME) { | ||
| 5312 | if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) | ||
| 5313 | enter_lmode(vcpu); | ||
| 5314 | if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) | ||
| 5315 | exit_lmode(vcpu); | ||
| 5316 | } | ||
| 5317 | #endif | ||
| 5318 | |||
| 5319 | if (enable_ept && !enable_unrestricted_guest) | ||
| 5320 | ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); | ||
| 5321 | |||
| 5322 | vmcs_writel(CR0_READ_SHADOW, cr0); | ||
| 5323 | vmcs_writel(GUEST_CR0, hw_cr0); | ||
| 5324 | vcpu->arch.cr0 = cr0; | ||
| 5325 | |||
| 5326 | /* depends on vcpu->arch.cr0 being set to the new value */ | ||
| 5327 | vmx->emulation_required = emulation_required(vcpu); | ||
| 5328 | } | ||
| 5329 | |||
| 5330 | static int get_ept_level(struct kvm_vcpu *vcpu) | ||
| 5331 | { | ||
| 5332 | if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48)) | ||
| 5333 | return 5; | ||
| 5334 | return 4; | ||
| 5335 | } | ||
| 5336 | |||
| 5337 | static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) | ||
| 5338 | { | ||
| 5339 | u64 eptp = VMX_EPTP_MT_WB; | ||
| 5340 | |||
| 5341 | eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; | ||
| 5342 | |||
| 5343 | if (enable_ept_ad_bits && | ||
| 5344 | (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) | ||
| 5345 | eptp |= VMX_EPTP_AD_ENABLE_BIT; | ||
| 5346 | eptp |= (root_hpa & PAGE_MASK); | ||
| 5347 | |||
| 5348 | return eptp; | ||
| 5349 | } | ||
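construct_eptp() above packs the EPT pointer: bits 2:0 select the memory type (6 = write-back), bits 5:3 hold the page-walk length minus one (3 for a 4-level walk, 4 for a 5-level walk), bit 6 enables accessed/dirty tracking, and the remaining bits carry the page-aligned root HPA. A standalone sketch of the same encoding, with the constants written out from the SDM rather than taken from this file:

#include <stdbool.h>
#include <stdint.h>

#define EPTP_MT_WB        0x6ull        /* write-back memory type */
#define EPTP_PWL_4        (3ull << 3)   /* 4-level walk: length - 1 = 3 */
#define EPTP_PWL_5        (4ull << 3)   /* 5-level walk: length - 1 = 4 */
#define EPTP_AD_ENABLE    (1ull << 6)
#define PAGE_MASK_4K      (~0xfffull)

static uint64_t build_eptp(uint64_t root_hpa, int levels, bool ad_bits)
{
	uint64_t eptp = EPTP_MT_WB;

	eptp |= (levels == 5) ? EPTP_PWL_5 : EPTP_PWL_4;
	if (ad_bits)
		eptp |= EPTP_AD_ENABLE;
	eptp |= root_hpa & PAGE_MASK_4K;    /* page-aligned EPT root */
	return eptp;
}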
| 5350 | |||
| 5351 | static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | ||
| 5352 | { | ||
| 5353 | struct kvm *kvm = vcpu->kvm; | ||
| 5354 | unsigned long guest_cr3; | ||
| 5355 | u64 eptp; | ||
| 5356 | |||
| 5357 | guest_cr3 = cr3; | ||
| 5358 | if (enable_ept) { | ||
| 5359 | eptp = construct_eptp(vcpu, cr3); | ||
| 5360 | vmcs_write64(EPT_POINTER, eptp); | ||
| 5361 | |||
| 5362 | if (kvm_x86_ops->tlb_remote_flush) { | ||
| 5363 | spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); | ||
| 5364 | to_vmx(vcpu)->ept_pointer = eptp; | ||
| 5365 | to_kvm_vmx(kvm)->ept_pointers_match | ||
| 5366 | = EPT_POINTERS_CHECK; | ||
| 5367 | spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); | ||
| 5368 | } | ||
| 5369 | |||
| 5370 | if (enable_unrestricted_guest || is_paging(vcpu) || | ||
| 5371 | is_guest_mode(vcpu)) | ||
| 5372 | guest_cr3 = kvm_read_cr3(vcpu); | ||
| 5373 | else | ||
| 5374 | guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; | ||
| 5375 | ept_load_pdptrs(vcpu); | ||
| 5376 | } | ||
| 5377 | |||
| 5378 | vmcs_writel(GUEST_CR3, guest_cr3); | ||
| 5379 | } | ||
| 5380 | |||
| 5381 | static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | ||
| 5382 | { | ||
| 5383 | /* | ||
| 5384 | * Pass through host's Machine Check Enable value to hw_cr4, which | ||
| 5385 | * is in force while we are in guest mode. Do not let guests control | ||
| 5386 | * this bit, even if host CR4.MCE == 0. | ||
| 5387 | */ | ||
| 5388 | unsigned long hw_cr4; | ||
| 5389 | |||
| 5390 | hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); | ||
| 5391 | if (enable_unrestricted_guest) | ||
| 5392 | hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; | ||
| 5393 | else if (to_vmx(vcpu)->rmode.vm86_active) | ||
| 5394 | hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; | ||
| 5395 | else | ||
| 5396 | hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; | ||
| 5397 | |||
| 5398 | if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) { | ||
| 5399 | if (cr4 & X86_CR4_UMIP) { | ||
| 5400 | vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, | ||
| 5401 | SECONDARY_EXEC_DESC); | ||
| 5402 | hw_cr4 &= ~X86_CR4_UMIP; | ||
| 5403 | } else if (!is_guest_mode(vcpu) || | ||
| 5404 | !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) | ||
| 5405 | vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, | ||
| 5406 | SECONDARY_EXEC_DESC); | ||
| 5407 | } | ||
| 5408 | |||
| 5409 | if (cr4 & X86_CR4_VMXE) { | ||
| 5410 | /* | ||
| 5411 | * To use VMXON (and later other VMX instructions), a guest | ||
| 5412 | * must first be able to turn on cr4.VMXE (see handle_vmon()). | ||
| 5413 | * So this is essentially the check on whether to allow nested | ||
| 5414 | * VMX. We operate under the default treatment of SMM, | ||
| 5415 | * so VMX cannot be enabled under SMM. | ||
| 5416 | */ | ||
| 5417 | if (!nested_vmx_allowed(vcpu) || is_smm(vcpu)) | ||
| 5418 | return 1; | ||
| 5419 | } | ||
| 5420 | |||
| 5421 | if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) | ||
| 5422 | return 1; | ||
| 5423 | |||
| 5424 | vcpu->arch.cr4 = cr4; | ||
| 5425 | |||
| 5426 | if (!enable_unrestricted_guest) { | ||
| 5427 | if (enable_ept) { | ||
| 5428 | if (!is_paging(vcpu)) { | ||
| 5429 | hw_cr4 &= ~X86_CR4_PAE; | ||
| 5430 | hw_cr4 |= X86_CR4_PSE; | ||
| 5431 | } else if (!(cr4 & X86_CR4_PAE)) { | ||
| 5432 | hw_cr4 &= ~X86_CR4_PAE; | ||
| 5433 | } | ||
| 5434 | } | ||
| 5435 | |||
| 5436 | /* | ||
| 5437 | * SMEP/SMAP/PKU are disabled in hardware when the CPU is in | ||
| 5438 | * non-paging mode. To emulate this behavior, SMEP/SMAP/PKU need | ||
| 5439 | * to be manually disabled when the guest switches to non-paging | ||
| 5440 | * mode. | ||
| 5441 | * | ||
| 5442 | * If !enable_unrestricted_guest, the CPU is always running | ||
| 5443 | * with CR0.PG=1 and CR4 needs to be modified. | ||
| 5444 | * If enable_unrestricted_guest, the CPU automatically | ||
| 5445 | * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. | ||
| 5446 | */ | ||
| 5447 | if (!is_paging(vcpu)) | ||
| 5448 | hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); | ||
| 5449 | } | ||
| 5450 | |||
| 5451 | vmcs_writel(CR4_READ_SHADOW, cr4); | ||
| 5452 | vmcs_writel(GUEST_CR4, hw_cr4); | ||
| 5453 | return 0; | ||
| 5454 | } | ||
| 5455 | |||
| 5456 | static void vmx_get_segment(struct kvm_vcpu *vcpu, | ||
| 5457 | struct kvm_segment *var, int seg) | ||
| 5458 | { | ||
| 5459 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 5460 | u32 ar; | ||
| 5461 | |||
| 5462 | if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { | ||
| 5463 | *var = vmx->rmode.segs[seg]; | ||
| 5464 | if (seg == VCPU_SREG_TR | ||
| 5465 | || var->selector == vmx_read_guest_seg_selector(vmx, seg)) | ||
| 5466 | return; | ||
| 5467 | var->base = vmx_read_guest_seg_base(vmx, seg); | ||
| 5468 | var->selector = vmx_read_guest_seg_selector(vmx, seg); | ||
| 5469 | return; | ||
| 5470 | } | ||
| 5471 | var->base = vmx_read_guest_seg_base(vmx, seg); | ||
| 5472 | var->limit = vmx_read_guest_seg_limit(vmx, seg); | ||
| 5473 | var->selector = vmx_read_guest_seg_selector(vmx, seg); | ||
| 5474 | ar = vmx_read_guest_seg_ar(vmx, seg); | ||
| 5475 | var->unusable = (ar >> 16) & 1; | ||
| 5476 | var->type = ar & 15; | ||
| 5477 | var->s = (ar >> 4) & 1; | ||
| 5478 | var->dpl = (ar >> 5) & 3; | ||
| 5479 | /* | ||
| 5480 | * Some userspaces do not preserve the unusable property. Since a usable | ||
| 5481 | * segment has to be present according to the VMX spec, we can use the | ||
| 5482 | * present property to work around the userspace bug by always making an | ||
| 5483 | * unusable segment nonpresent. vmx_segment_access_rights() already marks | ||
| 5484 | * a nonpresent segment as unusable. | ||
| 5485 | */ | ||
| 5486 | var->present = !var->unusable; | ||
| 5487 | var->avl = (ar >> 12) & 1; | ||
| 5488 | var->l = (ar >> 13) & 1; | ||
| 5489 | var->db = (ar >> 14) & 1; | ||
| 5490 | var->g = (ar >> 15) & 1; | ||
| 5491 | } | ||
| 5492 | |||
| 5493 | static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) | ||
| 5494 | { | ||
| 5495 | struct kvm_segment s; | ||
| 5496 | |||
| 5497 | if (to_vmx(vcpu)->rmode.vm86_active) { | ||
| 5498 | vmx_get_segment(vcpu, &s, seg); | ||
| 5499 | return s.base; | ||
| 5500 | } | ||
| 5501 | return vmx_read_guest_seg_base(to_vmx(vcpu), seg); | ||
| 5502 | } | ||
| 5503 | |||
| 5504 | static int vmx_get_cpl(struct kvm_vcpu *vcpu) | ||
| 5505 | { | ||
| 5506 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 5507 | |||
| 5508 | if (unlikely(vmx->rmode.vm86_active)) | ||
| 5509 | return 0; | ||
| 5510 | else { | ||
| 5511 | int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); | ||
| 5512 | return VMX_AR_DPL(ar); | ||
| 5513 | } | ||
| 5514 | } | ||
| 5515 | |||
| 5516 | static u32 vmx_segment_access_rights(struct kvm_segment *var) | ||
| 5517 | { | ||
| 5518 | u32 ar; | ||
| 5519 | |||
| 5520 | if (var->unusable || !var->present) | ||
| 5521 | ar = 1 << 16; | ||
| 5522 | else { | ||
| 5523 | ar = var->type & 15; | ||
| 5524 | ar |= (var->s & 1) << 4; | ||
| 5525 | ar |= (var->dpl & 3) << 5; | ||
| 5526 | ar |= (var->present & 1) << 7; | ||
| 5527 | ar |= (var->avl & 1) << 12; | ||
| 5528 | ar |= (var->l & 1) << 13; | ||
| 5529 | ar |= (var->db & 1) << 14; | ||
| 5530 | ar |= (var->g & 1) << 15; | ||
| 5531 | } | ||
| 5532 | |||
| 5533 | return ar; | ||
| 5534 | } | ||
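vmx_get_segment() and vmx_segment_access_rights() above convert between struct kvm_segment fields and the VMCS access-rights layout: type in bits 3:0, S in bit 4, DPL in bits 6:5, P in bit 7, AVL in bit 12, L in bit 13, D/B in bit 14, G in bit 15 and the "unusable" flag in bit 16. A small round-trip sketch of that packing (the struct below is illustrative, not the kernel's):

#include <stdint.h>

struct seg_attrs {
	unsigned type:4, s:1, dpl:2, present:1, avl:1, l:1, db:1, g:1, unusable:1;
};

static uint32_t pack_ar(const struct seg_attrs *a)
{
	if (a->unusable || !a->present)
		return 1u << 16;                     /* unusable segment */
	return a->type | (a->s << 4) | (a->dpl << 5) | (a->present << 7) |
	       (a->avl << 12) | (a->l << 13) | (a->db << 14) | (a->g << 15);
}

static void unpack_ar(uint32_t ar, struct seg_attrs *a)
{
	a->unusable = (ar >> 16) & 1;
	a->type = ar & 15;
	a->s = (ar >> 4) & 1;
	a->dpl = (ar >> 5) & 3;
	a->present = !a->unusable;               /* mirrors vmx_get_segment() */
	a->avl = (ar >> 12) & 1;
	a->l = (ar >> 13) & 1;
	a->db = (ar >> 14) & 1;
	a->g = (ar >> 15) & 1;
}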
| 5535 | |||
| 5536 | static void vmx_set_segment(struct kvm_vcpu *vcpu, | ||
| 5537 | struct kvm_segment *var, int seg) | ||
| 5538 | { | ||
| 5539 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 5540 | const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | ||
| 5541 | |||
| 5542 | vmx_segment_cache_clear(vmx); | ||
| 5543 | |||
| 5544 | if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { | ||
| 5545 | vmx->rmode.segs[seg] = *var; | ||
| 5546 | if (seg == VCPU_SREG_TR) | ||
| 5547 | vmcs_write16(sf->selector, var->selector); | ||
| 5548 | else if (var->s) | ||
| 5549 | fix_rmode_seg(seg, &vmx->rmode.segs[seg]); | ||
| 5550 | goto out; | ||
| 5551 | } | ||
| 5552 | |||
| 5553 | vmcs_writel(sf->base, var->base); | ||
| 5554 | vmcs_write32(sf->limit, var->limit); | ||
| 5555 | vmcs_write16(sf->selector, var->selector); | ||
| 5556 | |||
| 5557 | /* | ||
| 5558 | * Fix the "Accessed" bit in the AR field of the segment registers for | ||
| 5559 | * older qemu binaries. | ||
| 5560 | * The IA32 architecture specifies that at processor reset the | ||
| 5561 | * "Accessed" bit in the AR field of the segment registers is 1, but | ||
| 5562 | * qemu sets it to 0 in its userland code. This causes an invalid- | ||
| 5563 | * guest-state vmexit when "unrestricted guest" mode is turned on. | ||
| 5564 | * A fix for this setup issue in cpu_reset has been pushed to the qemu | ||
| 5565 | * tree; newer qemu binaries with that fix will not need this | ||
| 5566 | * kvm hack. | ||
| 5567 | */ | ||
| 5568 | if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) | ||
| 5569 | var->type |= 0x1; /* Accessed */ | ||
| 5570 | |||
| 5571 | vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); | ||
| 5572 | |||
| 5573 | out: | ||
| 5574 | vmx->emulation_required = emulation_required(vcpu); | ||
| 5575 | } | ||
| 5576 | |||
| 5577 | static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) | ||
| 5578 | { | ||
| 5579 | u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); | ||
| 5580 | |||
| 5581 | *db = (ar >> 14) & 1; | ||
| 5582 | *l = (ar >> 13) & 1; | ||
| 5583 | } | ||
| 5584 | |||
| 5585 | static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) | ||
| 5586 | { | ||
| 5587 | dt->size = vmcs_read32(GUEST_IDTR_LIMIT); | ||
| 5588 | dt->address = vmcs_readl(GUEST_IDTR_BASE); | ||
| 5589 | } | ||
| 5590 | |||
| 5591 | static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) | ||
| 5592 | { | ||
| 5593 | vmcs_write32(GUEST_IDTR_LIMIT, dt->size); | ||
| 5594 | vmcs_writel(GUEST_IDTR_BASE, dt->address); | ||
| 5595 | } | ||
| 5596 | |||
| 5597 | static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) | ||
| 5598 | { | ||
| 5599 | dt->size = vmcs_read32(GUEST_GDTR_LIMIT); | ||
| 5600 | dt->address = vmcs_readl(GUEST_GDTR_BASE); | ||
| 5601 | } | ||
| 5602 | |||
| 5603 | static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) | ||
| 5604 | { | ||
| 5605 | vmcs_write32(GUEST_GDTR_LIMIT, dt->size); | ||
| 5606 | vmcs_writel(GUEST_GDTR_BASE, dt->address); | ||
| 5607 | } | ||
| 5608 | |||
| 5609 | static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) | ||
| 5610 | { | ||
| 5611 | struct kvm_segment var; | ||
| 5612 | u32 ar; | ||
| 5613 | |||
| 5614 | vmx_get_segment(vcpu, &var, seg); | ||
| 5615 | var.dpl = 0x3; | ||
| 5616 | if (seg == VCPU_SREG_CS) | ||
| 5617 | var.type = 0x3; | ||
| 5618 | ar = vmx_segment_access_rights(&var); | ||
| 5619 | |||
| 5620 | if (var.base != (var.selector << 4)) | ||
| 5621 | return false; | ||
| 5622 | if (var.limit != 0xffff) | ||
| 5623 | return false; | ||
| 5624 | if (ar != 0xf3) | ||
| 5625 | return false; | ||
| 5626 | |||
| 5627 | return true; | ||
| 5628 | } | ||
| 5629 | |||
| 5630 | static bool code_segment_valid(struct kvm_vcpu *vcpu) | ||
| 5631 | { | ||
| 5632 | struct kvm_segment cs; | ||
| 5633 | unsigned int cs_rpl; | ||
| 5634 | |||
| 5635 | vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); | ||
| 5636 | cs_rpl = cs.selector & SEGMENT_RPL_MASK; | ||
| 5637 | |||
| 5638 | if (cs.unusable) | ||
| 5639 | return false; | ||
| 5640 | if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) | ||
| 5641 | return false; | ||
| 5642 | if (!cs.s) | ||
| 5643 | return false; | ||
| 5644 | if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { | ||
| 5645 | if (cs.dpl > cs_rpl) | ||
| 5646 | return false; | ||
| 5647 | } else { | ||
| 5648 | if (cs.dpl != cs_rpl) | ||
| 5649 | return false; | ||
| 5650 | } | ||
| 5651 | if (!cs.present) | ||
| 5652 | return false; | ||
| 5653 | |||
| 5654 | /* TODO: Add a Reserved-field check; this will require a new member in the kvm_segment_field structure */ | ||
| 5655 | return true; | ||
| 5656 | } | ||
| 5657 | |||
| 5658 | static bool stack_segment_valid(struct kvm_vcpu *vcpu) | ||
| 5659 | { | ||
| 5660 | struct kvm_segment ss; | ||
| 5661 | unsigned int ss_rpl; | ||
| 5662 | |||
| 5663 | vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); | ||
| 5664 | ss_rpl = ss.selector & SEGMENT_RPL_MASK; | ||
| 5665 | |||
| 5666 | if (ss.unusable) | ||
| 5667 | return true; | ||
| 5668 | if (ss.type != 3 && ss.type != 7) | ||
| 5669 | return false; | ||
| 5670 | if (!ss.s) | ||
| 5671 | return false; | ||
| 5672 | if (ss.dpl != ss_rpl) /* DPL != RPL */ | ||
| 5673 | return false; | ||
| 5674 | if (!ss.present) | ||
| 5675 | return false; | ||
| 5676 | |||
| 5677 | return true; | ||
| 5678 | } | ||
| 5679 | |||
| 5680 | static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) | ||
| 5681 | { | ||
| 5682 | struct kvm_segment var; | ||
| 5683 | unsigned int rpl; | ||
| 5684 | |||
| 5685 | vmx_get_segment(vcpu, &var, seg); | ||
| 5686 | rpl = var.selector & SEGMENT_RPL_MASK; | ||
| 5687 | |||
| 5688 | if (var.unusable) | ||
| 5689 | return true; | ||
| 5690 | if (!var.s) | ||
| 5691 | return false; | ||
| 5692 | if (!var.present) | ||
| 5693 | return false; | ||
| 5694 | if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { | ||
| 5695 | if (var.dpl < rpl) /* DPL < RPL */ | ||
| 5696 | return false; | ||
| 5697 | } | ||
| 5698 | |||
| 5699 | /* TODO: Add other members to kvm_segment_field to allow checking for other access | ||
| 5700 | * rights flags | ||
| 5701 | */ | ||
| 5702 | return true; | ||
| 5703 | } | ||
| 5704 | |||
| 5705 | static bool tr_valid(struct kvm_vcpu *vcpu) | ||
| 5706 | { | ||
| 5707 | struct kvm_segment tr; | ||
| 5708 | |||
| 5709 | vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); | ||
| 5710 | |||
| 5711 | if (tr.unusable) | ||
| 5712 | return false; | ||
| 5713 | if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ | ||
| 5714 | return false; | ||
| 5715 | if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ | ||
| 5716 | return false; | ||
| 5717 | if (!tr.present) | ||
| 5718 | return false; | ||
| 5719 | |||
| 5720 | return true; | ||
| 5721 | } | ||
| 5722 | |||
| 5723 | static bool ldtr_valid(struct kvm_vcpu *vcpu) | ||
| 5724 | { | ||
| 5725 | struct kvm_segment ldtr; | ||
| 5726 | |||
| 5727 | vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); | ||
| 5728 | |||
| 5729 | if (ldtr.unusable) | ||
| 5730 | return true; | ||
| 5731 | if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ | ||
| 5732 | return false; | ||
| 5733 | if (ldtr.type != 2) | ||
| 5734 | return false; | ||
| 5735 | if (!ldtr.present) | ||
| 5736 | return false; | ||
| 5737 | |||
| 5738 | return true; | ||
| 5739 | } | ||
| 5740 | |||
| 5741 | static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) | ||
| 5742 | { | ||
| 5743 | struct kvm_segment cs, ss; | ||
| 5744 | |||
| 5745 | vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); | ||
| 5746 | vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); | ||
| 5747 | |||
| 5748 | return ((cs.selector & SEGMENT_RPL_MASK) == | ||
| 5749 | (ss.selector & SEGMENT_RPL_MASK)); | ||
| 5750 | } | ||
| 5751 | |||
| 5752 | /* | ||
| 5753 | * Check if the guest state is valid. Returns true if valid, false if | ||
| 5754 | * not. | ||
| 5755 | * We assume that the registers are always usable. | ||
| 5756 | */ | ||
| 5757 | static bool guest_state_valid(struct kvm_vcpu *vcpu) | ||
| 5758 | { | ||
| 5759 | if (enable_unrestricted_guest) | ||
| 5760 | return true; | ||
| 5761 | |||
| 5762 | /* real mode guest state checks */ | ||
| 5763 | if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { | ||
| 5764 | if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) | ||
| 5765 | return false; | ||
| 5766 | if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) | ||
| 5767 | return false; | ||
| 5768 | if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) | ||
| 5769 | return false; | ||
| 5770 | if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) | ||
| 5771 | return false; | ||
| 5772 | if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) | ||
| 5773 | return false; | ||
| 5774 | if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) | ||
| 5775 | return false; | ||
| 5776 | } else { | ||
| 5777 | /* protected mode guest state checks */ | ||
| 5778 | if (!cs_ss_rpl_check(vcpu)) | ||
| 5779 | return false; | ||
| 5780 | if (!code_segment_valid(vcpu)) | ||
| 5781 | return false; | ||
| 5782 | if (!stack_segment_valid(vcpu)) | ||
| 5783 | return false; | ||
| 5784 | if (!data_segment_valid(vcpu, VCPU_SREG_DS)) | ||
| 5785 | return false; | ||
| 5786 | if (!data_segment_valid(vcpu, VCPU_SREG_ES)) | ||
| 5787 | return false; | ||
| 5788 | if (!data_segment_valid(vcpu, VCPU_SREG_FS)) | ||
| 5789 | return false; | ||
| 5790 | if (!data_segment_valid(vcpu, VCPU_SREG_GS)) | ||
| 5791 | return false; | ||
| 5792 | if (!tr_valid(vcpu)) | ||
| 5793 | return false; | ||
| 5794 | if (!ldtr_valid(vcpu)) | ||
| 5795 | return false; | ||
| 5796 | } | ||
| 5797 | /* TODO: | ||
| 5798 | * - Add checks on RIP | ||
| 5799 | * - Add checks on RFLAGS | ||
| 5800 | */ | ||
| 5801 | |||
| 5802 | return true; | ||
| 5803 | } | ||
| 5804 | |||
| 5805 | static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) | ||
| 5806 | { | ||
| 5807 | return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); | ||
| 5808 | } | ||
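page_address_valid() accepts a guest-physical address only if it is page aligned and has no bits set at or above the guest's reported MAXPHYADDR. A minimal sketch of the predicate, with maxphyaddr passed as a parameter instead of looked up from CPUID:

#include <stdbool.h>
#include <stdint.h>

/* 4 KiB pages assumed, as in the kernel helper above. */
static bool gpa_valid(uint64_t gpa, unsigned int maxphyaddr)
{
	return (gpa & 0xfff) == 0 && (gpa >> maxphyaddr) == 0;
}
/* gpa_valid(0x12345000, 36) -> true; gpa_valid(0x12345008, 36) -> false */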
| 5809 | |||
| 5810 | static int init_rmode_tss(struct kvm *kvm) | ||
| 5811 | { | ||
| 5812 | gfn_t fn; | ||
| 5813 | u16 data = 0; | ||
| 5814 | int idx, r; | ||
| 5815 | |||
| 5816 | idx = srcu_read_lock(&kvm->srcu); | ||
| 5817 | fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT; | ||
| 5818 | r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); | ||
| 5819 | if (r < 0) | ||
| 5820 | goto out; | ||
| 5821 | data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; | ||
| 5822 | r = kvm_write_guest_page(kvm, fn++, &data, | ||
| 5823 | TSS_IOPB_BASE_OFFSET, sizeof(u16)); | ||
| 5824 | if (r < 0) | ||
| 5825 | goto out; | ||
| 5826 | r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); | ||
| 5827 | if (r < 0) | ||
| 5828 | goto out; | ||
| 5829 | r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); | ||
| 5830 | if (r < 0) | ||
| 5831 | goto out; | ||
| 5832 | data = ~0; | ||
| 5833 | r = kvm_write_guest_page(kvm, fn, &data, | ||
| 5834 | RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, | ||
| 5835 | sizeof(u8)); | ||
| 5836 | out: | ||
| 5837 | srcu_read_unlock(&kvm->srcu, idx); | ||
| 5838 | return r; | ||
| 5839 | } | ||
| 5840 | |||
| 5841 | static int init_rmode_identity_map(struct kvm *kvm) | ||
| 5842 | { | ||
| 5843 | struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); | ||
| 5844 | int i, idx, r = 0; | ||
| 5845 | kvm_pfn_t identity_map_pfn; | ||
| 5846 | u32 tmp; | ||
| 5847 | |||
| 5848 | /* Protect kvm_vmx->ept_identity_pagetable_done. */ | ||
| 5849 | mutex_lock(&kvm->slots_lock); | ||
| 5850 | |||
| 5851 | if (likely(kvm_vmx->ept_identity_pagetable_done)) | ||
| 5852 | goto out2; | ||
| 5853 | |||
| 5854 | if (!kvm_vmx->ept_identity_map_addr) | ||
| 5855 | kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; | ||
| 5856 | identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT; | ||
| 5857 | |||
| 5858 | r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, | ||
| 5859 | kvm_vmx->ept_identity_map_addr, PAGE_SIZE); | ||
| 5860 | if (r < 0) | ||
| 5861 | goto out2; | ||
| 5862 | |||
| 5863 | idx = srcu_read_lock(&kvm->srcu); | ||
| 5864 | r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); | ||
| 5865 | if (r < 0) | ||
| 5866 | goto out; | ||
| 5867 | /* Set up identity-mapping pagetable for EPT in real mode */ | ||
| 5868 | for (i = 0; i < PT32_ENT_PER_PAGE; i++) { | ||
| 5869 | tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | | ||
| 5870 | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); | ||
| 5871 | r = kvm_write_guest_page(kvm, identity_map_pfn, | ||
| 5872 | &tmp, i * sizeof(tmp), sizeof(tmp)); | ||
| 5873 | if (r < 0) | ||
| 5874 | goto out; | ||
| 5875 | } | ||
| 5876 | kvm_vmx->ept_identity_pagetable_done = true; | ||
| 5877 | |||
| 5878 | out: | ||
| 5879 | srcu_read_unlock(&kvm->srcu, idx); | ||
| 5880 | |||
| 5881 | out2: | ||
| 5882 | mutex_unlock(&kvm->slots_lock); | ||
| 5883 | return r; | ||
| 5884 | } | ||
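init_rmode_identity_map() above fills a single page with 1024 page-directory entries in the 4 MiB PSE format, so that entry i maps the guest-physical range starting at i << 22 onto itself; together the entries identity-map the low 4 GiB for real-mode EPT. A standalone sketch of the entry encoding (flag bit positions assumed from the usual 32-bit PDE layout):

#include <stdint.h>
#include <stdio.h>

#define PDE_PRESENT  (1u << 0)
#define PDE_RW       (1u << 1)
#define PDE_USER     (1u << 2)
#define PDE_ACCESSED (1u << 5)
#define PDE_DIRTY    (1u << 6)
#define PDE_PSE      (1u << 7)   /* 4 MiB page */

int main(void)
{
	uint32_t pde[1024];

	for (int i = 0; i < 1024; i++)
		pde[i] = ((uint32_t)i << 22) | PDE_PRESENT | PDE_RW | PDE_USER |
			 PDE_ACCESSED | PDE_DIRTY | PDE_PSE;

	/* entry 1 maps guest-physical 0x400000..0x7fffff onto itself */
	printf("pde[1] = %#x\n", pde[1]);
	return 0;
}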
| 5885 | |||
| 5886 | static void seg_setup(int seg) | ||
| 5887 | { | ||
| 5888 | const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | ||
| 5889 | unsigned int ar; | ||
| 5890 | |||
| 5891 | vmcs_write16(sf->selector, 0); | ||
| 5892 | vmcs_writel(sf->base, 0); | ||
| 5893 | vmcs_write32(sf->limit, 0xffff); | ||
| 5894 | ar = 0x93; | ||
| 5895 | if (seg == VCPU_SREG_CS) | ||
| 5896 | ar |= 0x08; /* code segment */ | ||
| 5897 | |||
| 5898 | vmcs_write32(sf->ar_bytes, ar); | ||
| 5899 | } | ||
| 5900 | |||
| 5901 | static int alloc_apic_access_page(struct kvm *kvm) | ||
| 5902 | { | ||
| 5903 | struct page *page; | ||
| 5904 | int r = 0; | ||
| 5905 | |||
| 5906 | mutex_lock(&kvm->slots_lock); | ||
| 5907 | if (kvm->arch.apic_access_page_done) | ||
| 5908 | goto out; | ||
| 5909 | r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, | ||
| 5910 | APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); | ||
| 5911 | if (r) | ||
| 5912 | goto out; | ||
| 5913 | |||
| 5914 | page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); | ||
| 5915 | if (is_error_page(page)) { | ||
| 5916 | r = -EFAULT; | ||
| 5917 | goto out; | ||
| 5918 | } | ||
| 5919 | |||
| 5920 | /* | ||
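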
| 5921 | * Do not pin the page in memory, so that it can still be | ||
| 5922 | * migrated, e.g. by memory hot-unplug. | ||
| 5923 | */ | ||
| 5924 | put_page(page); | ||
| 5925 | kvm->arch.apic_access_page_done = true; | ||
| 5926 | out: | ||
| 5927 | mutex_unlock(&kvm->slots_lock); | ||
| 5928 | return r; | ||
| 5929 | } | ||
| 5930 | |||
| 5931 | static int allocate_vpid(void) | ||
| 5932 | { | ||
| 5933 | int vpid; | ||
| 5934 | |||
| 5935 | if (!enable_vpid) | ||
| 5936 | return 0; | ||
| 5937 | spin_lock(&vmx_vpid_lock); | ||
| 5938 | vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); | ||
| 5939 | if (vpid < VMX_NR_VPIDS) | ||
| 5940 | __set_bit(vpid, vmx_vpid_bitmap); | ||
| 5941 | else | ||
| 5942 | vpid = 0; | ||
| 5943 | spin_unlock(&vmx_vpid_lock); | ||
| 5944 | return vpid; | ||
| 5945 | } | ||
| 5946 | |||
| 5947 | static void free_vpid(int vpid) | ||
| 5948 | { | ||
| 5949 | if (!enable_vpid || vpid == 0) | ||
| 5950 | return; | ||
| 5951 | spin_lock(&vmx_vpid_lock); | ||
| 5952 | __clear_bit(vpid, vmx_vpid_bitmap); | ||
| 5953 | spin_unlock(&vmx_vpid_lock); | ||
| 5954 | } | ||
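allocate_vpid()/free_vpid() above implement a small lock-protected bitmap allocator over the VPID space, with VPID 0 reserved to mean "no VPID" and used as the fallback when the space is exhausted. A single-threaded sketch of the same allocation pattern (no locking; the names are illustrative, not the kernel's):

#include <stdint.h>

#define NR_VPIDS (1 << 16)

static uint64_t vpid_bitmap[NR_VPIDS / 64] = { 1 };  /* bit 0 reserved: VPID 0 = "none" */

static int vpid_alloc(void)
{
	for (int i = 0; i < NR_VPIDS; i++)
		if (!(vpid_bitmap[i / 64] & (1ull << (i % 64)))) {
			vpid_bitmap[i / 64] |= 1ull << (i % 64);
			return i;
		}
	return 0;                                    /* exhausted: fall back to VPID 0 */
}

static void vpid_free(int vpid)
{
	if (vpid)
		vpid_bitmap[vpid / 64] &= ~(1ull << (vpid % 64));
}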
| 5955 | |||
| 5956 | static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, | ||
| 5957 | u32 msr, int type) | ||
| 5958 | { | ||
| 5959 | int f = sizeof(unsigned long); | ||
| 5960 | |||
| 5961 | if (!cpu_has_vmx_msr_bitmap()) | ||
| 5962 | return; | ||
| 5963 | |||
| 5964 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 5965 | evmcs_touch_msr_bitmap(); | ||
| 5966 | |||
| 5967 | /* | ||
| 5968 | * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals | ||
| 5969 | * have the write-low and read-high bitmap offsets the wrong way round. | ||
| 5970 | * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. | ||
| 5971 | */ | ||
| 5972 | if (msr <= 0x1fff) { | ||
| 5973 | if (type & MSR_TYPE_R) | ||
| 5974 | /* read-low */ | ||
| 5975 | __clear_bit(msr, msr_bitmap + 0x000 / f); | ||
| 5976 | |||
| 5977 | if (type & MSR_TYPE_W) | ||
| 5978 | /* write-low */ | ||
| 5979 | __clear_bit(msr, msr_bitmap + 0x800 / f); | ||
| 5980 | |||
| 5981 | } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { | ||
| 5982 | msr &= 0x1fff; | ||
| 5983 | if (type & MSR_TYPE_R) | ||
| 5984 | /* read-high */ | ||
| 5985 | __clear_bit(msr, msr_bitmap + 0x400 / f); | ||
| 5986 | |||
| 5987 | if (type & MSR_TYPE_W) | ||
| 5988 | /* write-high */ | ||
| 5989 | __clear_bit(msr, msr_bitmap + 0xc00 / f); | ||
| 5990 | |||
| 5991 | } | ||
| 5992 | } | ||
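The MSR bitmap manipulated above is one 4 KiB page split into four 1 KiB regions: read-low at offset 0x000, read-high at 0x400, write-low at 0x800 and write-high at 0xc00, where "low" covers MSRs 0x00000000-0x00001fff and "high" covers 0xc0000000-0xc0001fff, indexed by the low 13 bits of the MSR. A sketch that computes where a given MSR/access lands in the page (the helper name is illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Return the byte offset into the 4 KiB bitmap page, or -1 if uncontrollable. */
static int msr_bitmap_byte(uint32_t msr, bool write)
{
	unsigned int base;

	if (msr <= 0x1fff)
		base = write ? 0x800 : 0x000;           /* "low" MSRs */
	else if (msr >= 0xc0000000 && msr <= 0xc0001fff)
		base = write ? 0xc00 : 0x400;           /* "high" MSRs */
	else
		return -1;                              /* cannot be controlled */

	msr &= 0x1fff;
	return base + msr / 8;
}

int main(void)
{
	uint32_t msr = 0xc0000080;                      /* e.g. MSR_EFER */
	printf("write intercept: byte %#x, bit %u\n",
	       msr_bitmap_byte(msr, true), (msr & 0x1fff) % 8);
	return 0;
}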
| 5993 | |||
| 5994 | static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, | ||
| 5995 | u32 msr, int type) | ||
| 5996 | { | ||
| 5997 | int f = sizeof(unsigned long); | ||
| 5998 | |||
| 5999 | if (!cpu_has_vmx_msr_bitmap()) | ||
| 6000 | return; | ||
| 6001 | |||
| 6002 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 6003 | evmcs_touch_msr_bitmap(); | ||
| 6004 | |||
| 6005 | /* | ||
| 6006 | * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals | ||
| 6007 | * have the write-low and read-high bitmap offsets the wrong way round. | ||
| 6008 | * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. | ||
| 6009 | */ | ||
| 6010 | if (msr <= 0x1fff) { | ||
| 6011 | if (type & MSR_TYPE_R) | ||
| 6012 | /* read-low */ | ||
| 6013 | __set_bit(msr, msr_bitmap + 0x000 / f); | ||
| 6014 | |||
| 6015 | if (type & MSR_TYPE_W) | ||
| 6016 | /* write-low */ | ||
| 6017 | __set_bit(msr, msr_bitmap + 0x800 / f); | ||
| 6018 | |||
| 6019 | } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { | ||
| 6020 | msr &= 0x1fff; | ||
| 6021 | if (type & MSR_TYPE_R) | ||
| 6022 | /* read-high */ | ||
| 6023 | __set_bit(msr, msr_bitmap + 0x400 / f); | ||
| 6024 | |||
| 6025 | if (type & MSR_TYPE_W) | ||
| 6026 | /* write-high */ | ||
| 6027 | __set_bit(msr, msr_bitmap + 0xc00 / f); | ||
| 6028 | |||
| 6029 | } | ||
| 6030 | } | ||
| 6031 | |||
| 6032 | static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap, | ||
| 6033 | u32 msr, int type, bool value) | ||
| 6034 | { | ||
| 6035 | if (value) | ||
| 6036 | vmx_enable_intercept_for_msr(msr_bitmap, msr, type); | ||
| 6037 | else | ||
| 6038 | vmx_disable_intercept_for_msr(msr_bitmap, msr, type); | ||
| 6039 | } | ||
| 6040 | |||
| 6041 | /* | ||
| 6042 | * If an MSR is allowed (not intercepted) by L0, check whether it is also | ||
| 6043 | * allowed by L1. The corresponding bit is cleared only if both L0 and L1 allow it. | ||
| 6044 | */ | ||
| 6045 | static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, | ||
| 6046 | unsigned long *msr_bitmap_nested, | ||
| 6047 | u32 msr, int type) | ||
| 6048 | { | ||
| 6049 | int f = sizeof(unsigned long); | ||
| 6050 | |||
| 6051 | /* | ||
| 6052 | * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals | ||
| 6053 | * have the write-low and read-high bitmap offsets the wrong way round. | ||
| 6054 | * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. | ||
| 6055 | */ | ||
| 6056 | if (msr <= 0x1fff) { | ||
| 6057 | if (type & MSR_TYPE_R && | ||
| 6058 | !test_bit(msr, msr_bitmap_l1 + 0x000 / f)) | ||
| 6059 | /* read-low */ | ||
| 6060 | __clear_bit(msr, msr_bitmap_nested + 0x000 / f); | ||
| 6061 | |||
| 6062 | if (type & MSR_TYPE_W && | ||
| 6063 | !test_bit(msr, msr_bitmap_l1 + 0x800 / f)) | ||
| 6064 | /* write-low */ | ||
| 6065 | __clear_bit(msr, msr_bitmap_nested + 0x800 / f); | ||
| 6066 | |||
| 6067 | } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { | ||
| 6068 | msr &= 0x1fff; | ||
| 6069 | if (type & MSR_TYPE_R && | ||
| 6070 | !test_bit(msr, msr_bitmap_l1 + 0x400 / f)) | ||
| 6071 | /* read-high */ | ||
| 6072 | __clear_bit(msr, msr_bitmap_nested + 0x400 / f); | ||
| 6073 | |||
| 6074 | if (type & MSR_TYPE_W && | ||
| 6075 | !test_bit(msr, msr_bitmap_l1 + 0xc00 / f)) | ||
| 6076 | /* write-high */ | ||
| 6077 | __clear_bit(msr, msr_bitmap_nested + 0xc00 / f); | ||
| 6078 | |||
| 6079 | } | ||
| 6080 | } | ||
| 6081 | |||
| 6082 | static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) | ||
| 6083 | { | ||
| 6084 | u8 mode = 0; | ||
| 6085 | |||
| 6086 | if (cpu_has_secondary_exec_ctrls() && | ||
| 6087 | (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & | ||
| 6088 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { | ||
| 6089 | mode |= MSR_BITMAP_MODE_X2APIC; | ||
| 6090 | if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) | ||
| 6091 | mode |= MSR_BITMAP_MODE_X2APIC_APICV; | ||
| 6092 | } | ||
| 6093 | |||
| 6094 | return mode; | ||
| 6095 | } | ||
| 6096 | |||
| 6097 | #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) | ||
| 6098 | |||
| 6099 | static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap, | ||
| 6100 | u8 mode) | ||
| 6101 | { | ||
| 6102 | int msr; | ||
| 6103 | |||
| 6104 | for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { | ||
| 6105 | unsigned word = msr / BITS_PER_LONG; | ||
| 6106 | msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0; | ||
| 6107 | msr_bitmap[word + (0x800 / sizeof(long))] = ~0; | ||
| 6108 | } | ||
| 6109 | |||
| 6110 | if (mode & MSR_BITMAP_MODE_X2APIC) { | ||
| 6111 | /* | ||
| 6112 | * TPR reads and writes can be virtualized even if virtual interrupt | ||
| 6113 | * delivery is not in use. | ||
| 6114 | */ | ||
| 6115 | vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW); | ||
| 6116 | if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { | ||
| 6117 | vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R); | ||
| 6118 | vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); | ||
| 6119 | vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); | ||
| 6120 | } | ||
| 6121 | } | ||
| 6122 | } | ||
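X2APIC_MSR() maps an xAPIC MMIO register offset to its x2APIC MSR: the MSRs start at APIC_BASE_MSR (0x800) and each 16-byte MMIO register corresponds to one MSR, hence the >> 4; the loop above then covers the 256 MSRs 0x800-0x8ff. A tiny sketch of the mapping (the MMIO offsets are the architectural values, written out here as literals):

#include <stdint.h>
#include <stdio.h>

#define APIC_BASE_MSR 0x800
#define X2APIC_MSR(reg) (APIC_BASE_MSR + ((reg) >> 4))

int main(void)
{
	/* xAPIC MMIO offsets: TPR = 0x80, EOI = 0xB0, SELF_IPI = 0x3F0 */
	printf("TPR      -> MSR %#x\n", X2APIC_MSR(0x80));   /* 0x808 */
	printf("EOI      -> MSR %#x\n", X2APIC_MSR(0xB0));   /* 0x80b */
	printf("SELF_IPI -> MSR %#x\n", X2APIC_MSR(0x3F0));  /* 0x83f */
	return 0;
}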
| 6123 | |||
| 6124 | static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) | ||
| 6125 | { | ||
| 6126 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 6127 | unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; | ||
| 6128 | u8 mode = vmx_msr_bitmap_mode(vcpu); | ||
| 6129 | u8 changed = mode ^ vmx->msr_bitmap_mode; | ||
| 6130 | |||
| 6131 | if (!changed) | ||
| 6132 | return; | ||
| 6133 | |||
| 6134 | if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) | ||
| 6135 | vmx_update_msr_bitmap_x2apic(msr_bitmap, mode); | ||
| 6136 | |||
| 6137 | vmx->msr_bitmap_mode = mode; | ||
| 6138 | } | ||
| 6139 | |||
| 6140 | static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu) | ||
| 6141 | { | ||
| 6142 | return enable_apicv; | ||
| 6143 | } | ||
| 6144 | |||
| 6145 | static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) | ||
| 6146 | { | ||
| 6147 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 6148 | gfn_t gfn; | ||
| 6149 | |||
| 6150 | /* | ||
| 6151 | * Don't need to mark the APIC access page dirty; it is never | ||
| 6152 | * written to by the CPU during APIC virtualization. | ||
| 6153 | */ | ||
| 6154 | |||
| 6155 | if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { | ||
| 6156 | gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; | ||
| 6157 | kvm_vcpu_mark_page_dirty(vcpu, gfn); | ||
| 6158 | } | ||
| 6159 | |||
| 6160 | if (nested_cpu_has_posted_intr(vmcs12)) { | ||
| 6161 | gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; | ||
| 6162 | kvm_vcpu_mark_page_dirty(vcpu, gfn); | ||
| 6163 | } | ||
| 6164 | } | ||
| 6165 | |||
| 6166 | |||
| 6167 | static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) | ||
| 6168 | { | ||
| 6169 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 6170 | int max_irr; | ||
| 6171 | void *vapic_page; | ||
| 6172 | u16 status; | ||
| 6173 | |||
| 6174 | if (!vmx->nested.pi_desc || !vmx->nested.pi_pending) | ||
| 6175 | return; | ||
| 6176 | |||
| 6177 | vmx->nested.pi_pending = false; | ||
| 6178 | if (!pi_test_and_clear_on(vmx->nested.pi_desc)) | ||
| 6179 | return; | ||
| 6180 | |||
| 6181 | max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); | ||
| 6182 | if (max_irr != 256) { | ||
| 6183 | vapic_page = kmap(vmx->nested.virtual_apic_page); | ||
| 6184 | __kvm_apic_update_irr(vmx->nested.pi_desc->pir, | ||
| 6185 | vapic_page, &max_irr); | ||
| 6186 | kunmap(vmx->nested.virtual_apic_page); | ||
| 6187 | |||
| 6188 | status = vmcs_read16(GUEST_INTR_STATUS); | ||
| 6189 | if ((u8)max_irr > ((u8)status & 0xff)) { | ||
| 6190 | status &= ~0xff; | ||
| 6191 | status |= (u8)max_irr; | ||
| 6192 | vmcs_write16(GUEST_INTR_STATUS, status); | ||
| 6193 | } | ||
| 6194 | } | ||
| 6195 | |||
| 6196 | nested_mark_vmcs12_pages_dirty(vcpu); | ||
| 6197 | } | ||
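vmx_complete_nested_posted_interrupt() scans the 256-bit PIR for the highest pending vector and, if that vector exceeds the current RVI (the low byte of GUEST_INTR_STATUS), raises RVI to it. A compact userspace sketch of that scan-and-update step (these are not the kernel helpers):

#include <stdint.h>
#include <stdio.h>

/* Highest set bit in a 256-bit PIR stored as four 64-bit words, or -1. */
static int pir_find_last(const uint64_t pir[4])
{
	for (int w = 3; w >= 0; w--)
		if (pir[w])
			return w * 64 + 63 - __builtin_clzll(pir[w]);
	return -1;
}

int main(void)
{
	uint64_t pir[4] = { 0 };
	uint16_t intr_status = 0x0035;          /* low byte = current RVI */

	pir[0] |= 1ull << 0x41;                 /* pending vector 0x41 */

	int max_irr = pir_find_last(pir);
	if (max_irr >= 0 && (uint8_t)max_irr > (uint8_t)intr_status)
		intr_status = (intr_status & ~0xff) | (uint8_t)max_irr;

	printf("new GUEST_INTR_STATUS = %#x\n", intr_status);
	return 0;
}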
| 6198 | |||
| 6199 | static u8 vmx_get_rvi(void) | ||
| 6200 | { | ||
| 6201 | return vmcs_read16(GUEST_INTR_STATUS) & 0xff; | ||
| 6202 | } | ||
| 6203 | |||
| 6204 | static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) | ||
| 6205 | { | ||
| 6206 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 6207 | void *vapic_page; | ||
| 6208 | u32 vppr; | ||
| 6209 | int rvi; | ||
| 6210 | |||
| 6211 | if (WARN_ON_ONCE(!is_guest_mode(vcpu)) || | ||
| 6212 | !nested_cpu_has_vid(get_vmcs12(vcpu)) || | ||
| 6213 | WARN_ON_ONCE(!vmx->nested.virtual_apic_page)) | ||
| 6214 | return false; | ||
| 6215 | |||
| 6216 | rvi = vmx_get_rvi(); | ||
| 6217 | |||
| 6218 | vapic_page = kmap(vmx->nested.virtual_apic_page); | ||
| 6219 | vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); | ||
| 6220 | kunmap(vmx->nested.virtual_apic_page); | ||
| 6221 | |||
| 6222 | return ((rvi & 0xf0) > (vppr & 0xf0)); | ||
| 6223 | } | ||
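The (rvi & 0xf0) > (vppr & 0xf0) comparison above applies the APIC priority-class rule: only the upper nibbles are compared, so a pending vector is deliverable only if its priority class is strictly higher than the class of the virtual PPR. A one-function sketch, with two hypothetical example values in the comments:

#include <stdbool.h>
#include <stdint.h>

/* True if a pending vector with value rvi beats the virtual PPR. */
static bool apic_interrupt_deliverable(uint8_t rvi, uint8_t vppr)
{
	return (rvi & 0xf0) > (vppr & 0xf0);
}
/* apic_interrupt_deliverable(0x41, 0x35) -> true  (class 4 > class 3) */
/* apic_interrupt_deliverable(0x38, 0x35) -> false (same class 3)      */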
| 6224 | |||
| 6225 | static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, | ||
| 6226 | bool nested) | ||
| 6227 | { | ||
| 6228 | #ifdef CONFIG_SMP | ||
| 6229 | int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR; | ||
| 6230 | |||
| 6231 | if (vcpu->mode == IN_GUEST_MODE) { | ||
| 6232 | /* | ||
| 6233 | * The vector of the interrupt to be delivered to the vcpu | ||
| 6234 | * was set in the PIR before this function was called. | ||
| 6235 | * | ||
| 6236 | * The following cases can be reached in this block, and | ||
| 6237 | * we always send a notification event in all of them, as | ||
| 6238 | * explained below. | ||
| 6239 | * | ||
| 6240 | * Case 1: the vcpu stays in non-root mode. Sending a | ||
| 6241 | * notification event posts the interrupt to the vcpu. | ||
| 6242 | * | ||
| 6243 | * Case 2: the vcpu exits to root mode and is still | ||
| 6244 | * runnable. The PIR will be synced to the vIRR before the | ||
| 6245 | * next vcpu entry. Sending a notification event in | ||
| 6246 | * this case has no effect, as the vcpu is no longer in | ||
| 6247 | * non-root mode. | ||
| 6248 | * | ||
| 6249 | * Case 3: the vcpu exits to root mode and is blocked. | ||
| 6250 | * vcpu_block() has already synced the PIR to the vIRR and | ||
| 6251 | * never blocks the vcpu if the vIRR is not clear. Therefore, | ||
| 6252 | * a blocked vcpu here is not waiting for any interrupt | ||
| 6253 | * requested in the PIR, and sending a notification event | ||
| 6254 | * that has no effect is safe here. | ||
| 6255 | */ | ||
| 6256 | |||
| 6257 | apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); | ||
| 6258 | return true; | ||
| 6259 | } | ||
| 6260 | #endif | ||
| 6261 | return false; | ||
| 6262 | } | ||
| 6263 | |||
| 6264 | static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, | ||
| 6265 | int vector) | ||
| 6266 | { | ||
| 6267 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 6268 | |||
| 6269 | if (is_guest_mode(vcpu) && | ||
| 6270 | vector == vmx->nested.posted_intr_nv) { | ||
| 6271 | /* | ||
| 6272 | * If a posted interrupt is not recognized by hardware, | ||
| 6273 | * we will deliver it on the next vmentry. | ||
| 6274 | */ | ||
| 6275 | vmx->nested.pi_pending = true; | ||
| 6276 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
| 6277 | /* the PIR and ON have been set by L1. */ | ||
| 6278 | if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) | ||
| 6279 | kvm_vcpu_kick(vcpu); | ||
| 6280 | return 0; | ||
| 6281 | } | ||
| 6282 | return -1; | ||
| 6283 | } | ||
| 6284 | /* | ||
| 6285 | * Send an interrupt to the vcpu via the posted-interrupt mechanism. | ||
| 6286 | * 1. If the target vcpu is running (non-root mode), send a posted-interrupt | ||
| 6287 | * notification and the hardware will sync the PIR to the vIRR atomically. | ||
| 6288 | * 2. If the target vcpu isn't running (root mode), kick it so that it picks up | ||
| 6289 | * the interrupt from the PIR on the next vmentry. | ||
| 6290 | */ | ||
| 6291 | static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) | ||
| 6292 | { | ||
| 6293 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 6294 | int r; | ||
| 6295 | |||
| 6296 | r = vmx_deliver_nested_posted_interrupt(vcpu, vector); | ||
| 6297 | if (!r) | ||
| 6298 | return; | ||
| 6299 | |||
| 6300 | if (pi_test_and_set_pir(vector, &vmx->pi_desc)) | ||
| 6301 | return; | ||
| 6302 | |||
| 6303 | /* If a previous notification has sent the IPI, nothing to do. */ | ||
| 6304 | if (pi_test_and_set_on(&vmx->pi_desc)) | ||
| 6305 | return; | ||
| 6306 | |||
| 6307 | if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) | ||
| 6308 | kvm_vcpu_kick(vcpu); | ||
| 6309 | } | ||
| 6310 | |||
| 6311 | /* | ||
| 6312 | * Set up the vmcs's constant host-state fields, i.e., host-state fields that | ||
| 6313 | * will not change in the lifetime of the guest. | ||
| 6314 | * Note that host-state that does change is set elsewhere. E.g., host-state | ||
| 6315 | * that is set differently for each CPU is set in vmx_vcpu_load(), not here. | ||
| 6316 | */ | ||
| 6317 | static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) | ||
| 6318 | { | ||
| 6319 | u32 low32, high32; | ||
| 6320 | unsigned long tmpl; | ||
| 6321 | struct desc_ptr dt; | ||
| 6322 | unsigned long cr0, cr3, cr4; | ||
| 6323 | |||
| 6324 | cr0 = read_cr0(); | ||
| 6325 | WARN_ON(cr0 & X86_CR0_TS); | ||
| 6326 | vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ | ||
| 6327 | |||
| 6328 | /* | ||
| 6329 | * Save the most likely value for this task's CR3 in the VMCS. | ||
| 6330 | * We can't use __get_current_cr3_fast() because we're not atomic. | ||
| 6331 | */ | ||
| 6332 | cr3 = __read_cr3(); | ||
| 6333 | vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ | ||
| 6334 | vmx->loaded_vmcs->host_state.cr3 = cr3; | ||
| 6335 | |||
| 6336 | /* Save the most likely value for this task's CR4 in the VMCS. */ | ||
| 6337 | cr4 = cr4_read_shadow(); | ||
| 6338 | vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ | ||
| 6339 | vmx->loaded_vmcs->host_state.cr4 = cr4; | ||
| 6340 | |||
| 6341 | vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ | ||
| 6342 | #ifdef CONFIG_X86_64 | ||
| 6343 | /* | ||
| 6344 | * Load null selectors, so we can avoid reloading them in | ||
| 6345 | * vmx_prepare_switch_to_host(), in case userspace uses | ||
| 6346 | * the null selectors too (the expected case). | ||
| 6347 | */ | ||
| 6348 | vmcs_write16(HOST_DS_SELECTOR, 0); | ||
| 6349 | vmcs_write16(HOST_ES_SELECTOR, 0); | ||
| 6350 | #else | ||
| 6351 | vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
| 6352 | vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
| 6353 | #endif | ||
| 6354 | vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
| 6355 | vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ | ||
| 6356 | |||
| 6357 | store_idt(&dt); | ||
| 6358 | vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ | ||
| 6359 | vmx->host_idt_base = dt.address; | ||
| 6360 | |||
| 6361 | vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */ | ||
| 6362 | |||
| 6363 | rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); | ||
| 6364 | vmcs_write32(HOST_IA32_SYSENTER_CS, low32); | ||
| 6365 | rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); | ||
| 6366 | vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ | ||
| 6367 | |||
| 6368 | if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { | ||
| 6369 | rdmsr(MSR_IA32_CR_PAT, low32, high32); | ||
| 6370 | vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); | ||
| 6371 | } | ||
| 6372 | |||
| 6373 | if (cpu_has_load_ia32_efer) | ||
| 6374 | vmcs_write64(HOST_IA32_EFER, host_efer); | ||
| 6375 | } | ||
| 6376 | |||
| 6377 | static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) | ||
| 6378 | { | ||
| 6379 | vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; | ||
| 6380 | if (enable_ept) | ||
| 6381 | vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; | ||
| 6382 | if (is_guest_mode(&vmx->vcpu)) | ||
| 6383 | vmx->vcpu.arch.cr4_guest_owned_bits &= | ||
| 6384 | ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask; | ||
| 6385 | vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); | ||
| 6386 | } | ||
| 6387 | |||
| 6388 | static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) | ||
| 6389 | { | ||
| 6390 | u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; | ||
| 6391 | |||
| 6392 | if (!kvm_vcpu_apicv_active(&vmx->vcpu)) | ||
| 6393 | pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; | ||
| 6394 | |||
| 6395 | if (!enable_vnmi) | ||
| 6396 | pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS; | ||
| 6397 | |||
| 6398 | /* Enable the preemption timer dynamically */ | ||
| 6399 | pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; | ||
| 6400 | return pin_based_exec_ctrl; | ||
| 6401 | } | ||
| 6402 | |||
| 6403 | static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) | ||
| 6404 | { | ||
| 6405 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 6406 | |||
| 6407 | vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); | ||
| 6408 | if (cpu_has_secondary_exec_ctrls()) { | ||
| 6409 | if (kvm_vcpu_apicv_active(vcpu)) | ||
| 6410 | vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, | ||
| 6411 | SECONDARY_EXEC_APIC_REGISTER_VIRT | | ||
| 6412 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); | ||
| 6413 | else | ||
| 6414 | vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, | ||
| 6415 | SECONDARY_EXEC_APIC_REGISTER_VIRT | | ||
| 6416 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); | ||
| 6417 | } | ||
| 6418 | |||
| 6419 | if (cpu_has_vmx_msr_bitmap()) | ||
| 6420 | vmx_update_msr_bitmap(vcpu); | ||
| 6421 | } | ||
| 6422 | |||
| 6423 | static u32 vmx_exec_control(struct vcpu_vmx *vmx) | ||
| 6424 | { | ||
| 6425 | u32 exec_control = vmcs_config.cpu_based_exec_ctrl; | ||
| 6426 | |||
| 6427 | if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) | ||
| 6428 | exec_control &= ~CPU_BASED_MOV_DR_EXITING; | ||
| 6429 | |||
| 6430 | if (!cpu_need_tpr_shadow(&vmx->vcpu)) { | ||
| 6431 | exec_control &= ~CPU_BASED_TPR_SHADOW; | ||
| 6432 | #ifdef CONFIG_X86_64 | ||
| 6433 | exec_control |= CPU_BASED_CR8_STORE_EXITING | | ||
| 6434 | CPU_BASED_CR8_LOAD_EXITING; | ||
| 6435 | #endif | ||
| 6436 | } | ||
| 6437 | if (!enable_ept) | ||
| 6438 | exec_control |= CPU_BASED_CR3_STORE_EXITING | | ||
| 6439 | CPU_BASED_CR3_LOAD_EXITING | | ||
| 6440 | CPU_BASED_INVLPG_EXITING; | ||
| 6441 | if (kvm_mwait_in_guest(vmx->vcpu.kvm)) | ||
| 6442 | exec_control &= ~(CPU_BASED_MWAIT_EXITING | | ||
| 6443 | CPU_BASED_MONITOR_EXITING); | ||
| 6444 | if (kvm_hlt_in_guest(vmx->vcpu.kvm)) | ||
| 6445 | exec_control &= ~CPU_BASED_HLT_EXITING; | ||
| 6446 | return exec_control; | ||
| 6447 | } | ||
| 6448 | |||
| 6449 | static bool vmx_rdrand_supported(void) | ||
| 6450 | { | ||
| 6451 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 6452 | SECONDARY_EXEC_RDRAND_EXITING; | ||
| 6453 | } | ||
| 6454 | |||
| 6455 | static bool vmx_rdseed_supported(void) | ||
| 6456 | { | ||
| 6457 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 6458 | SECONDARY_EXEC_RDSEED_EXITING; | ||
| 6459 | } | ||
| 6460 | |||
| 6461 | static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) | ||
| 6462 | { | ||
| 6463 | struct kvm_vcpu *vcpu = &vmx->vcpu; | ||
| 6464 | |||
| 6465 | u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; | ||
| 6466 | |||
| 6467 | if (!cpu_need_virtualize_apic_accesses(vcpu)) | ||
| 6468 | exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
| 6469 | if (vmx->vpid == 0) | ||
| 6470 | exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; | ||
| 6471 | if (!enable_ept) { | ||
| 6472 | exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; | ||
| 6473 | enable_unrestricted_guest = 0; | ||
| 6474 | } | ||
| 6475 | if (!enable_unrestricted_guest) | ||
| 6476 | exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; | ||
| 6477 | if (kvm_pause_in_guest(vmx->vcpu.kvm)) | ||
| 6478 | exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; | ||
| 6479 | if (!kvm_vcpu_apicv_active(vcpu)) | ||
| 6480 | exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | | ||
| 6481 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); | ||
| 6482 | exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; | ||
| 6483 | |||
| 6484 | /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, | ||
| 6485 | * in vmx_set_cr4. */ | ||
| 6486 | exec_control &= ~SECONDARY_EXEC_DESC; | ||
| 6487 | |||
| 6488 | /* | ||
| 6489 | * SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD | ||
| 6490 | * (handle_vmptrld). We cannot enable shadow_vmcs here because we | ||
| 6491 | * do not yet have a current VMCS12. | ||
| 6492 | */ | ||
| 6493 | exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; | ||
| 6494 | |||
| 6495 | if (!enable_pml) | ||
| 6496 | exec_control &= ~SECONDARY_EXEC_ENABLE_PML; | ||
| 6497 | |||
| 6498 | if (vmx_xsaves_supported()) { | ||
| 6499 | /* Exposing XSAVES only when XSAVE is exposed */ | ||
| 6500 | bool xsaves_enabled = | ||
| 6501 | guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && | ||
| 6502 | guest_cpuid_has(vcpu, X86_FEATURE_XSAVES); | ||
| 6503 | |||
| 6504 | if (!xsaves_enabled) | ||
| 6505 | exec_control &= ~SECONDARY_EXEC_XSAVES; | ||
| 6506 | |||
| 6507 | if (nested) { | ||
| 6508 | if (xsaves_enabled) | ||
| 6509 | vmx->nested.msrs.secondary_ctls_high |= | ||
| 6510 | SECONDARY_EXEC_XSAVES; | ||
| 6511 | else | ||
| 6512 | vmx->nested.msrs.secondary_ctls_high &= | ||
| 6513 | ~SECONDARY_EXEC_XSAVES; | ||
| 6514 | } | ||
| 6515 | } | ||
| 6516 | |||
| 6517 | if (vmx_rdtscp_supported()) { | ||
| 6518 | bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP); | ||
| 6519 | if (!rdtscp_enabled) | ||
| 6520 | exec_control &= ~SECONDARY_EXEC_RDTSCP; | ||
| 6521 | |||
| 6522 | if (nested) { | ||
| 6523 | if (rdtscp_enabled) | ||
| 6524 | vmx->nested.msrs.secondary_ctls_high |= | ||
| 6525 | SECONDARY_EXEC_RDTSCP; | ||
| 6526 | else | ||
| 6527 | vmx->nested.msrs.secondary_ctls_high &= | ||
| 6528 | ~SECONDARY_EXEC_RDTSCP; | ||
| 6529 | } | ||
| 6530 | } | ||
| 6531 | |||
| 6532 | if (vmx_invpcid_supported()) { | ||
| 6533 | /* Exposing INVPCID only when PCID is exposed */ | ||
| 6534 | bool invpcid_enabled = | ||
| 6535 | guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) && | ||
| 6536 | guest_cpuid_has(vcpu, X86_FEATURE_PCID); | ||
| 6537 | |||
| 6538 | if (!invpcid_enabled) { | ||
| 6539 | exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; | ||
| 6540 | guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID); | ||
| 6541 | } | ||
| 6542 | |||
| 6543 | if (nested) { | ||
| 6544 | if (invpcid_enabled) | ||
| 6545 | vmx->nested.msrs.secondary_ctls_high |= | ||
| 6546 | SECONDARY_EXEC_ENABLE_INVPCID; | ||
| 6547 | else | ||
| 6548 | vmx->nested.msrs.secondary_ctls_high &= | ||
| 6549 | ~SECONDARY_EXEC_ENABLE_INVPCID; | ||
| 6550 | } | ||
| 6551 | } | ||
| 6552 | |||
| 6553 | if (vmx_rdrand_supported()) { | ||
| 6554 | bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND); | ||
| 6555 | if (rdrand_enabled) | ||
| 6556 | exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING; | ||
| 6557 | |||
| 6558 | if (nested) { | ||
| 6559 | if (rdrand_enabled) | ||
| 6560 | vmx->nested.msrs.secondary_ctls_high |= | ||
| 6561 | SECONDARY_EXEC_RDRAND_EXITING; | ||
| 6562 | else | ||
| 6563 | vmx->nested.msrs.secondary_ctls_high &= | ||
| 6564 | ~SECONDARY_EXEC_RDRAND_EXITING; | ||
| 6565 | } | ||
| 6566 | } | ||
| 6567 | |||
| 6568 | if (vmx_rdseed_supported()) { | ||
| 6569 | bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED); | ||
| 6570 | if (rdseed_enabled) | ||
| 6571 | exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING; | ||
| 6572 | |||
| 6573 | if (nested) { | ||
| 6574 | if (rdseed_enabled) | ||
| 6575 | vmx->nested.msrs.secondary_ctls_high |= | ||
| 6576 | SECONDARY_EXEC_RDSEED_EXITING; | ||
| 6577 | else | ||
| 6578 | vmx->nested.msrs.secondary_ctls_high &= | ||
| 6579 | ~SECONDARY_EXEC_RDSEED_EXITING; | ||
| 6580 | } | ||
| 6581 | } | ||
| 6582 | |||
| 6583 | vmx->secondary_exec_control = exec_control; | ||
| 6584 | } | ||
| 6585 | |||
| 6586 | static void ept_set_mmio_spte_mask(void) | ||
| 6587 | { | ||
| 6588 | /* | ||
| 6589 | * EPT Misconfigurations can be generated if the value of bits 2:0 | ||
| 6590 | * of an EPT paging-structure entry is 110b (write/execute). | ||
| 6591 | */ | ||
| 6592 | kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK, | ||
| 6593 | VMX_EPT_MISCONFIG_WX_VALUE); | ||
| 6594 | } | ||
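The mask/value pair registered above lets the MMU recognize MMIO SPTEs later on: an EPT entry whose permission bits are write+execute without read is always a misconfiguration, so such entries can be used as MMIO markers. A rough, self-contained sketch of the resulting check, using stand-in constants rather than the real VMX_EPT_* definitions:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-ins for VMX_EPT_RWX_MASK / VMX_EPT_MISCONFIG_WX_VALUE (R=1, W=2, X=4). */
#define EPT_RWX_MASK      0x7ULL
#define EPT_MISCONFIG_WX  0x6ULL	/* write + execute, not readable */

/* Illustrative only: how an SPTE marked for MMIO would be recognized. */
static bool is_mmio_spte_example(uint64_t spte)
{
	return (spte & EPT_RWX_MASK) == EPT_MISCONFIG_WX;
}

int main(void)
{
	printf("%d %d\n", is_mmio_spte_example(0x6), is_mmio_spte_example(0x7));
	return 0;
}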
| 6595 | |||
| 6596 | #define VMX_XSS_EXIT_BITMAP 0 | ||
| 6597 | /* | ||
| 6598 | * Sets up the vmcs for emulated real mode. | ||
| 6599 | */ | ||
| 6600 | static void vmx_vcpu_setup(struct vcpu_vmx *vmx) | ||
| 6601 | { | ||
| 6602 | int i; | ||
| 6603 | |||
| 6604 | if (enable_shadow_vmcs) { | ||
| 6605 | /* | ||
| 6606 | * At vCPU creation, "VMWRITE to any supported field | ||
| 6607 | * in the VMCS" is supported, so use the more | ||
| 6608 | * permissive vmx_vmread_bitmap to specify both read | ||
| 6609 | * and write permissions for the shadow VMCS. | ||
| 6610 | */ | ||
| 6611 | vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); | ||
| 6612 | vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap)); | ||
| 6613 | } | ||
| 6614 | if (cpu_has_vmx_msr_bitmap()) | ||
| 6615 | vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); | ||
| 6616 | |||
| 6617 | vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ | ||
| 6618 | |||
| 6619 | /* Control */ | ||
| 6620 | vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); | ||
| 6621 | vmx->hv_deadline_tsc = -1; | ||
| 6622 | |||
| 6623 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); | ||
| 6624 | |||
| 6625 | if (cpu_has_secondary_exec_ctrls()) { | ||
| 6626 | vmx_compute_secondary_exec_control(vmx); | ||
| 6627 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, | ||
| 6628 | vmx->secondary_exec_control); | ||
| 6629 | } | ||
| 6630 | |||
| 6631 | if (kvm_vcpu_apicv_active(&vmx->vcpu)) { | ||
| 6632 | vmcs_write64(EOI_EXIT_BITMAP0, 0); | ||
| 6633 | vmcs_write64(EOI_EXIT_BITMAP1, 0); | ||
| 6634 | vmcs_write64(EOI_EXIT_BITMAP2, 0); | ||
| 6635 | vmcs_write64(EOI_EXIT_BITMAP3, 0); | ||
| 6636 | |||
| 6637 | vmcs_write16(GUEST_INTR_STATUS, 0); | ||
| 6638 | |||
| 6639 | vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); | ||
| 6640 | vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); | ||
| 6641 | } | ||
| 6642 | |||
| 6643 | if (!kvm_pause_in_guest(vmx->vcpu.kvm)) { | ||
| 6644 | vmcs_write32(PLE_GAP, ple_gap); | ||
| 6645 | vmx->ple_window = ple_window; | ||
| 6646 | vmx->ple_window_dirty = true; | ||
| 6647 | } | ||
| 6648 | |||
| 6649 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); | ||
| 6650 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); | ||
| 6651 | vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ | ||
| 6652 | |||
| 6653 | vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ | ||
| 6654 | vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ | ||
| 6655 | vmx_set_constant_host_state(vmx); | ||
| 6656 | vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ | ||
| 6657 | vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ | ||
| 6658 | |||
| 6659 | if (cpu_has_vmx_vmfunc()) | ||
| 6660 | vmcs_write64(VM_FUNCTION_CONTROL, 0); | ||
| 6661 | |||
| 6662 | vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); | ||
| 6663 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); | ||
| 6664 | vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); | ||
| 6665 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); | ||
| 6666 | vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); | ||
| 6667 | |||
| 6668 | if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) | ||
| 6669 | vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); | ||
| 6670 | |||
| 6671 | for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { | ||
| 6672 | u32 index = vmx_msr_index[i]; | ||
| 6673 | u32 data_low, data_high; | ||
| 6674 | int j = vmx->nmsrs; | ||
| 6675 | |||
| 6676 | if (rdmsr_safe(index, &data_low, &data_high) < 0) | ||
| 6677 | continue; | ||
| 6678 | if (wrmsr_safe(index, data_low, data_high) < 0) | ||
| 6679 | continue; | ||
| 6680 | vmx->guest_msrs[j].index = i; | ||
| 6681 | vmx->guest_msrs[j].data = 0; | ||
| 6682 | vmx->guest_msrs[j].mask = -1ull; | ||
| 6683 | ++vmx->nmsrs; | ||
| 6684 | } | ||
| 6685 | |||
| 6686 | vmx->arch_capabilities = kvm_get_arch_capabilities(); | ||
| 6687 | |||
| 6688 | vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl); | ||
| 6689 | |||
| 6690 | /* 22.2.1, 20.8.1 */ | ||
| 6691 | vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl); | ||
| 6692 | |||
| 6693 | vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS; | ||
| 6694 | vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS); | ||
| 6695 | |||
| 6696 | set_cr4_guest_host_mask(vmx); | ||
| 6697 | |||
| 6698 | if (vmx_xsaves_supported()) | ||
| 6699 | vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); | ||
| 6700 | |||
| 6701 | if (enable_pml) { | ||
| 6702 | vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); | ||
| 6703 | vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); | ||
| 6704 | } | ||
| 6705 | |||
| 6706 | if (cpu_has_vmx_encls_vmexit()) | ||
| 6707 | vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); | ||
| 6708 | } | ||
| 6709 | |||
| 6710 | static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) | ||
| 6711 | { | ||
| 6712 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 6713 | struct msr_data apic_base_msr; | ||
| 6714 | u64 cr0; | ||
| 6715 | |||
| 6716 | vmx->rmode.vm86_active = 0; | ||
| 6717 | vmx->spec_ctrl = 0; | ||
| 6718 | |||
| 6719 | vcpu->arch.microcode_version = 0x100000000ULL; | ||
| 6720 | vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); | ||
| 6721 | kvm_set_cr8(vcpu, 0); | ||
| 6722 | |||
| 6723 | if (!init_event) { | ||
| 6724 | apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | | ||
| 6725 | MSR_IA32_APICBASE_ENABLE; | ||
| 6726 | if (kvm_vcpu_is_reset_bsp(vcpu)) | ||
| 6727 | apic_base_msr.data |= MSR_IA32_APICBASE_BSP; | ||
| 6728 | apic_base_msr.host_initiated = true; | ||
| 6729 | kvm_set_apic_base(vcpu, &apic_base_msr); | ||
| 6730 | } | ||
| 6731 | |||
| 6732 | vmx_segment_cache_clear(vmx); | ||
| 6733 | |||
| 6734 | seg_setup(VCPU_SREG_CS); | ||
| 6735 | vmcs_write16(GUEST_CS_SELECTOR, 0xf000); | ||
| 6736 | vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); | ||
| 6737 | |||
| 6738 | seg_setup(VCPU_SREG_DS); | ||
| 6739 | seg_setup(VCPU_SREG_ES); | ||
| 6740 | seg_setup(VCPU_SREG_FS); | ||
| 6741 | seg_setup(VCPU_SREG_GS); | ||
| 6742 | seg_setup(VCPU_SREG_SS); | ||
| 6743 | |||
| 6744 | vmcs_write16(GUEST_TR_SELECTOR, 0); | ||
| 6745 | vmcs_writel(GUEST_TR_BASE, 0); | ||
| 6746 | vmcs_write32(GUEST_TR_LIMIT, 0xffff); | ||
| 6747 | vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); | ||
| 6748 | |||
| 6749 | vmcs_write16(GUEST_LDTR_SELECTOR, 0); | ||
| 6750 | vmcs_writel(GUEST_LDTR_BASE, 0); | ||
| 6751 | vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); | ||
| 6752 | vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); | ||
| 6753 | |||
| 6754 | if (!init_event) { | ||
| 6755 | vmcs_write32(GUEST_SYSENTER_CS, 0); | ||
| 6756 | vmcs_writel(GUEST_SYSENTER_ESP, 0); | ||
| 6757 | vmcs_writel(GUEST_SYSENTER_EIP, 0); | ||
| 6758 | vmcs_write64(GUEST_IA32_DEBUGCTL, 0); | ||
| 6759 | } | ||
| 6760 | |||
| 6761 | kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); | ||
| 6762 | kvm_rip_write(vcpu, 0xfff0); | ||
| 6763 | |||
| 6764 | vmcs_writel(GUEST_GDTR_BASE, 0); | ||
| 6765 | vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); | ||
| 6766 | |||
| 6767 | vmcs_writel(GUEST_IDTR_BASE, 0); | ||
| 6768 | vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); | ||
| 6769 | |||
| 6770 | vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); | ||
| 6771 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); | ||
| 6772 | vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); | ||
| 6773 | if (kvm_mpx_supported()) | ||
| 6774 | vmcs_write64(GUEST_BNDCFGS, 0); | ||
| 6775 | |||
| 6776 | setup_msrs(vmx); | ||
| 6777 | |||
| 6778 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ | ||
| 6779 | |||
| 6780 | if (cpu_has_vmx_tpr_shadow() && !init_event) { | ||
| 6781 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); | ||
| 6782 | if (cpu_need_tpr_shadow(vcpu)) | ||
| 6783 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, | ||
| 6784 | __pa(vcpu->arch.apic->regs)); | ||
| 6785 | vmcs_write32(TPR_THRESHOLD, 0); | ||
| 6786 | } | ||
| 6787 | |||
| 6788 | kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); | ||
| 6789 | |||
| 6790 | if (vmx->vpid != 0) | ||
| 6791 | vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); | ||
| 6792 | |||
| 6793 | cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; | ||
| 6794 | vmx->vcpu.arch.cr0 = cr0; | ||
| 6795 | vmx_set_cr0(vcpu, cr0); /* enter rmode */ | ||
| 6796 | vmx_set_cr4(vcpu, 0); | ||
| 6797 | vmx_set_efer(vcpu, 0); | ||
| 6798 | |||
| 6799 | update_exception_bitmap(vcpu); | ||
| 6800 | |||
| 6801 | vpid_sync_context(vmx->vpid); | ||
| 6802 | if (init_event) | ||
| 6803 | vmx_clear_hlt(vcpu); | ||
| 6804 | } | ||
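The segment and RIP values programmed during reset reproduce the architectural power-on state: with CS.selector = 0xf000, CS.base = 0xffff0000 and RIP = 0xfff0, the first instruction is fetched from 0xffff0000 + 0xfff0 = 0xfffffff0, the x86 reset vector just below 4 GiB, where the guest firmware is expected to live.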
| 6805 | |||
| 6806 | /* | ||
| 6807 | * In nested virtualization, check if L1 asked to exit on external interrupts. | ||
| 6808 | * For most existing hypervisors, this will always return true. | ||
| 6809 | */ | ||
| 6810 | static bool nested_exit_on_intr(struct kvm_vcpu *vcpu) | ||
| 6811 | { | ||
| 6812 | return get_vmcs12(vcpu)->pin_based_vm_exec_control & | ||
| 6813 | PIN_BASED_EXT_INTR_MASK; | ||
| 6814 | } | ||
| 6815 | |||
| 6816 | /* | ||
| 6817 | * In nested virtualization, check if L1 has set | ||
| 6818 | * VM_EXIT_ACK_INTR_ON_EXIT | ||
| 6819 | */ | ||
| 6820 | static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) | ||
| 6821 | { | ||
| 6822 | return get_vmcs12(vcpu)->vm_exit_controls & | ||
| 6823 | VM_EXIT_ACK_INTR_ON_EXIT; | ||
| 6824 | } | ||
| 6825 | |||
| 6826 | static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) | ||
| 6827 | { | ||
| 6828 | return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu)); | ||
| 6829 | } | ||
| 6830 | |||
| 6831 | static void enable_irq_window(struct kvm_vcpu *vcpu) | ||
| 6832 | { | ||
| 6833 | vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, | ||
| 6834 | CPU_BASED_VIRTUAL_INTR_PENDING); | ||
| 6835 | } | ||
| 6836 | |||
| 6837 | static void enable_nmi_window(struct kvm_vcpu *vcpu) | ||
| 6838 | { | ||
| 6839 | if (!enable_vnmi || | ||
| 6840 | vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { | ||
| 6841 | enable_irq_window(vcpu); | ||
| 6842 | return; | ||
| 6843 | } | ||
| 6844 | |||
| 6845 | vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, | ||
| 6846 | CPU_BASED_VIRTUAL_NMI_PENDING); | ||
| 6847 | } | ||
| 6848 | |||
| 6849 | static void vmx_inject_irq(struct kvm_vcpu *vcpu) | ||
| 6850 | { | ||
| 6851 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 6852 | uint32_t intr; | ||
| 6853 | int irq = vcpu->arch.interrupt.nr; | ||
| 6854 | |||
| 6855 | trace_kvm_inj_virq(irq); | ||
| 6856 | |||
| 6857 | ++vcpu->stat.irq_injections; | ||
| 6858 | if (vmx->rmode.vm86_active) { | ||
| 6859 | int inc_eip = 0; | ||
| 6860 | if (vcpu->arch.interrupt.soft) | ||
| 6861 | inc_eip = vcpu->arch.event_exit_inst_len; | ||
| 6862 | if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE) | ||
| 6863 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); | ||
| 6864 | return; | ||
| 6865 | } | ||
| 6866 | intr = irq | INTR_INFO_VALID_MASK; | ||
| 6867 | if (vcpu->arch.interrupt.soft) { | ||
| 6868 | intr |= INTR_TYPE_SOFT_INTR; | ||
| 6869 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, | ||
| 6870 | vmx->vcpu.arch.event_exit_inst_len); | ||
| 6871 | } else | ||
| 6872 | intr |= INTR_TYPE_EXT_INTR; | ||
| 6873 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); | ||
| 6874 | |||
| 6875 | vmx_clear_hlt(vcpu); | ||
| 6876 | } | ||
| 6877 | |||
| 6878 | static void vmx_inject_nmi(struct kvm_vcpu *vcpu) | ||
| 6879 | { | ||
| 6880 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 6881 | |||
| 6882 | if (!enable_vnmi) { | ||
| 6883 | /* | ||
| 6884 | * Tracking the NMI-blocked state in software is built upon | ||
| 6885 | * finding the next open IRQ window. This, in turn, depends on | ||
| 6886 | * well-behaving guests: They have to keep IRQs disabled at | ||
| 6887 | * least as long as the NMI handler runs. Otherwise we may | ||
| 6888 | * cause NMI nesting, maybe breaking the guest. But as this is | ||
| 6889 | * highly unlikely, we can live with the residual risk. | ||
| 6890 | */ | ||
| 6891 | vmx->loaded_vmcs->soft_vnmi_blocked = 1; | ||
| 6892 | vmx->loaded_vmcs->vnmi_blocked_time = 0; | ||
| 6893 | } | ||
| 6894 | |||
| 6895 | ++vcpu->stat.nmi_injections; | ||
| 6896 | vmx->loaded_vmcs->nmi_known_unmasked = false; | ||
| 6897 | |||
| 6898 | if (vmx->rmode.vm86_active) { | ||
| 6899 | if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) | ||
| 6900 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); | ||
| 6901 | return; | ||
| 6902 | } | ||
| 6903 | |||
| 6904 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
| 6905 | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); | ||
| 6906 | |||
| 6907 | vmx_clear_hlt(vcpu); | ||
| 6908 | } | ||
| 6909 | |||
| 6910 | static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) | ||
| 6911 | { | ||
| 6912 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 6913 | bool masked; | ||
| 6914 | |||
| 6915 | if (!enable_vnmi) | ||
| 6916 | return vmx->loaded_vmcs->soft_vnmi_blocked; | ||
| 6917 | if (vmx->loaded_vmcs->nmi_known_unmasked) | ||
| 6918 | return false; | ||
| 6919 | masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; | ||
| 6920 | vmx->loaded_vmcs->nmi_known_unmasked = !masked; | ||
| 6921 | return masked; | ||
| 6922 | } | ||
| 6923 | |||
| 6924 | static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) | ||
| 6925 | { | ||
| 6926 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 6927 | |||
| 6928 | if (!enable_vnmi) { | ||
| 6929 | if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) { | ||
| 6930 | vmx->loaded_vmcs->soft_vnmi_blocked = masked; | ||
| 6931 | vmx->loaded_vmcs->vnmi_blocked_time = 0; | ||
| 6932 | } | ||
| 6933 | } else { | ||
| 6934 | vmx->loaded_vmcs->nmi_known_unmasked = !masked; | ||
| 6935 | if (masked) | ||
| 6936 | vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, | ||
| 6937 | GUEST_INTR_STATE_NMI); | ||
| 6938 | else | ||
| 6939 | vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, | ||
| 6940 | GUEST_INTR_STATE_NMI); | ||
| 6941 | } | ||
| 6942 | } | ||
| 6943 | |||
| 6944 | static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) | ||
| 6945 | { | ||
| 6946 | if (to_vmx(vcpu)->nested.nested_run_pending) | ||
| 6947 | return 0; | ||
| 6948 | |||
| 6949 | if (!enable_vnmi && | ||
| 6950 | to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked) | ||
| 6951 | return 0; | ||
| 6952 | |||
| 6953 | return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & | ||
| 6954 | (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | ||
| 6955 | | GUEST_INTR_STATE_NMI)); | ||
| 6956 | } | ||
| 6957 | |||
| 6958 | static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) | ||
| 6959 | { | ||
| 6960 | return (!to_vmx(vcpu)->nested.nested_run_pending && | ||
| 6961 | vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && | ||
| 6962 | !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & | ||
| 6963 | (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); | ||
| 6964 | } | ||
| 6965 | |||
| 6966 | static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) | ||
| 6967 | { | ||
| 6968 | int ret; | ||
| 6969 | |||
| 6970 | if (enable_unrestricted_guest) | ||
| 6971 | return 0; | ||
| 6972 | |||
| 6973 | ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, | ||
| 6974 | PAGE_SIZE * 3); | ||
| 6975 | if (ret) | ||
| 6976 | return ret; | ||
| 6977 | to_kvm_vmx(kvm)->tss_addr = addr; | ||
| 6978 | return init_rmode_tss(kvm); | ||
| 6979 | } | ||
| 6980 | |||
| 6981 | static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) | ||
| 6982 | { | ||
| 6983 | to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr; | ||
| 6984 | return 0; | ||
| 6985 | } | ||
| 6986 | |||
| 6987 | static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) | ||
| 6988 | { | ||
| 6989 | switch (vec) { | ||
| 6990 | case BP_VECTOR: | ||
| 6991 | /* | ||
| 6992 | * Update instruction length as we may reinject the exception | ||
| 6993 | * from user space while in guest debugging mode. | ||
| 6994 | */ | ||
| 6995 | to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = | ||
| 6996 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | ||
| 6997 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) | ||
| 6998 | return false; | ||
| 6999 | /* fall through */ | ||
| 7000 | case DB_VECTOR: | ||
| 7001 | if (vcpu->guest_debug & | ||
| 7002 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) | ||
| 7003 | return false; | ||
| 7004 | /* fall through */ | ||
| 7005 | case DE_VECTOR: | ||
| 7006 | case OF_VECTOR: | ||
| 7007 | case BR_VECTOR: | ||
| 7008 | case UD_VECTOR: | ||
| 7009 | case DF_VECTOR: | ||
| 7010 | case SS_VECTOR: | ||
| 7011 | case GP_VECTOR: | ||
| 7012 | case MF_VECTOR: | ||
| 7013 | return true; | ||
| 7014 | break; | ||
| 7015 | } | ||
| 7016 | return false; | ||
| 7017 | } | ||
| 7018 | |||
| 7019 | static int handle_rmode_exception(struct kvm_vcpu *vcpu, | ||
| 7020 | int vec, u32 err_code) | ||
| 7021 | { | ||
| 7022 | /* | ||
| 7023 | * An instruction with the address-size override prefix (opcode 0x67) | ||
| 7024 | * causes an #SS fault with error code 0 in VM86 mode. | ||
| 7025 | */ | ||
| 7026 | if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { | ||
| 7027 | if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) { | ||
| 7028 | if (vcpu->arch.halt_request) { | ||
| 7029 | vcpu->arch.halt_request = 0; | ||
| 7030 | return kvm_vcpu_halt(vcpu); | ||
| 7031 | } | ||
| 7032 | return 1; | ||
| 7033 | } | ||
| 7034 | return 0; | ||
| 7035 | } | ||
| 7036 | |||
| 7037 | /* | ||
| 7038 | * Forward all other exceptions that are valid in real mode. | ||
| 7039 | * FIXME: Breaks guest debugging in real mode, needs to be fixed with | ||
| 7040 | * the required debugging infrastructure rework. | ||
| 7041 | */ | ||
| 7042 | kvm_queue_exception(vcpu, vec); | ||
| 7043 | return 1; | ||
| 7044 | } | ||
| 7045 | |||
| 7046 | /* | ||
| 7047 | * Trigger machine check on the host. We assume all the MSRs are already set up | ||
| 7048 | * by the CPU and that we still run on the same CPU as the MCE occurred on. | ||
| 7049 | * We pass a fake environment to the machine check handler because we want | ||
| 7050 | * the guest to be always treated like user space, no matter what context | ||
| 7051 | * it used internally. | ||
| 7052 | */ | ||
| 7053 | static void kvm_machine_check(void) | ||
| 7054 | { | ||
| 7055 | #if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) | ||
| 7056 | struct pt_regs regs = { | ||
| 7057 | .cs = 3, /* Fake ring 3 no matter what the guest ran on */ | ||
| 7058 | .flags = X86_EFLAGS_IF, | ||
| 7059 | }; | ||
| 7060 | |||
| 7061 | do_machine_check(®s, 0); | ||
| 7062 | #endif | ||
| 7063 | } | ||
| 7064 | |||
| 7065 | static int handle_machine_check(struct kvm_vcpu *vcpu) | ||
| 7066 | { | ||
| 7067 | /* already handled by vcpu_run */ | ||
| 7068 | return 1; | ||
| 7069 | } | ||
| 7070 | |||
| 7071 | static int handle_exception(struct kvm_vcpu *vcpu) | ||
| 7072 | { | ||
| 7073 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 7074 | struct kvm_run *kvm_run = vcpu->run; | ||
| 7075 | u32 intr_info, ex_no, error_code; | ||
| 7076 | unsigned long cr2, rip, dr6; | ||
| 7077 | u32 vect_info; | ||
| 7078 | enum emulation_result er; | ||
| 7079 | |||
| 7080 | vect_info = vmx->idt_vectoring_info; | ||
| 7081 | intr_info = vmx->exit_intr_info; | ||
| 7082 | |||
| 7083 | if (is_machine_check(intr_info)) | ||
| 7084 | return handle_machine_check(vcpu); | ||
| 7085 | |||
| 7086 | if (is_nmi(intr_info)) | ||
| 7087 | return 1; /* already handled by vmx_vcpu_run() */ | ||
| 7088 | |||
| 7089 | if (is_invalid_opcode(intr_info)) | ||
| 7090 | return handle_ud(vcpu); | ||
| 7091 | |||
| 7092 | error_code = 0; | ||
| 7093 | if (intr_info & INTR_INFO_DELIVER_CODE_MASK) | ||
| 7094 | error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); | ||
| 7095 | |||
| 7096 | if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { | ||
| 7097 | WARN_ON_ONCE(!enable_vmware_backdoor); | ||
| 7098 | er = kvm_emulate_instruction(vcpu, | ||
| 7099 | EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); | ||
| 7100 | if (er == EMULATE_USER_EXIT) | ||
| 7101 | return 0; | ||
| 7102 | else if (er != EMULATE_DONE) | ||
| 7103 | kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); | ||
| 7104 | return 1; | ||
| 7105 | } | ||
| 7106 | |||
| 7107 | /* | ||
| 7108 | * A #PF with PFEC.RSVD = 1 indicates that the guest is accessing | ||
| 7109 | * MMIO; in that case it is better to report an internal error. | ||
| 7110 | * See the comments in vmx_handle_exit. | ||
| 7111 | */ | ||
| 7112 | if ((vect_info & VECTORING_INFO_VALID_MASK) && | ||
| 7113 | !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { | ||
| 7114 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; | ||
| 7115 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; | ||
| 7116 | vcpu->run->internal.ndata = 3; | ||
| 7117 | vcpu->run->internal.data[0] = vect_info; | ||
| 7118 | vcpu->run->internal.data[1] = intr_info; | ||
| 7119 | vcpu->run->internal.data[2] = error_code; | ||
| 7120 | return 0; | ||
| 7121 | } | ||
| 7122 | |||
| 7123 | if (is_page_fault(intr_info)) { | ||
| 7124 | cr2 = vmcs_readl(EXIT_QUALIFICATION); | ||
| 7125 | /* EPT won't cause page fault directly */ | ||
| 7126 | WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept); | ||
| 7127 | return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); | ||
| 7128 | } | ||
| 7129 | |||
| 7130 | ex_no = intr_info & INTR_INFO_VECTOR_MASK; | ||
| 7131 | |||
| 7132 | if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) | ||
| 7133 | return handle_rmode_exception(vcpu, ex_no, error_code); | ||
| 7134 | |||
| 7135 | switch (ex_no) { | ||
| 7136 | case AC_VECTOR: | ||
| 7137 | kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); | ||
| 7138 | return 1; | ||
| 7139 | case DB_VECTOR: | ||
| 7140 | dr6 = vmcs_readl(EXIT_QUALIFICATION); | ||
| 7141 | if (!(vcpu->guest_debug & | ||
| 7142 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { | ||
| 7143 | vcpu->arch.dr6 &= ~15; | ||
| 7144 | vcpu->arch.dr6 |= dr6 | DR6_RTM; | ||
| 7145 | if (is_icebp(intr_info)) | ||
| 7146 | skip_emulated_instruction(vcpu); | ||
| 7147 | |||
| 7148 | kvm_queue_exception(vcpu, DB_VECTOR); | ||
| 7149 | return 1; | ||
| 7150 | } | ||
| 7151 | kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; | ||
| 7152 | kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); | ||
| 7153 | /* fall through */ | ||
| 7154 | case BP_VECTOR: | ||
| 7155 | /* | ||
| 7156 | * Update instruction length as we may reinject #BP from | ||
| 7157 | * user space while in guest debugging mode. Reading it for | ||
| 7158 | * #DB as well causes no harm, it is not used in that case. | ||
| 7159 | */ | ||
| 7160 | vmx->vcpu.arch.event_exit_inst_len = | ||
| 7161 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | ||
| 7162 | kvm_run->exit_reason = KVM_EXIT_DEBUG; | ||
| 7163 | rip = kvm_rip_read(vcpu); | ||
| 7164 | kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; | ||
| 7165 | kvm_run->debug.arch.exception = ex_no; | ||
| 7166 | break; | ||
| 7167 | default: | ||
| 7168 | kvm_run->exit_reason = KVM_EXIT_EXCEPTION; | ||
| 7169 | kvm_run->ex.exception = ex_no; | ||
| 7170 | kvm_run->ex.error_code = error_code; | ||
| 7171 | break; | ||
| 7172 | } | ||
| 7173 | return 0; | ||
| 7174 | } | ||
| 7175 | |||
| 7176 | static int handle_external_interrupt(struct kvm_vcpu *vcpu) | ||
| 7177 | { | ||
| 7178 | ++vcpu->stat.irq_exits; | ||
| 7179 | return 1; | ||
| 7180 | } | ||
| 7181 | |||
| 7182 | static int handle_triple_fault(struct kvm_vcpu *vcpu) | ||
| 7183 | { | ||
| 7184 | vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; | ||
| 7185 | vcpu->mmio_needed = 0; | ||
| 7186 | return 0; | ||
| 7187 | } | ||
| 7188 | |||
| 7189 | static int handle_io(struct kvm_vcpu *vcpu) | ||
| 7190 | { | ||
| 7191 | unsigned long exit_qualification; | ||
| 7192 | int size, in, string; | ||
| 7193 | unsigned port; | ||
| 7194 | |||
| 7195 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 7196 | string = (exit_qualification & 16) != 0; | ||
| 7197 | |||
| 7198 | ++vcpu->stat.io_exits; | ||
| 7199 | |||
| 7200 | if (string) | ||
| 7201 | return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; | ||
| 7202 | |||
| 7203 | port = exit_qualification >> 16; | ||
| 7204 | size = (exit_qualification & 7) + 1; | ||
| 7205 | in = (exit_qualification & 8) != 0; | ||
| 7206 | |||
| 7207 | return kvm_fast_pio(vcpu, size, port, in); | ||
| 7208 | } | ||
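handle_io() above unpacks the I/O-instruction exit qualification: bits 2:0 hold the access size minus one, bit 3 the direction, bit 4 the string-instruction flag, and bits 31:16 the port number. A self-contained decode of one made-up qualification value, using the same bit layout as the handler:

#include <stdio.h>

int main(void)
{
	unsigned long q = 0x03f80008;		/* hypothetical I/O exit qualification */
	int size        = (q & 7) + 1;		/* bits 2:0: access size minus one */
	int in          = (q & 8) != 0;		/* bit 3: 1 = IN, 0 = OUT */
	int string      = (q & 16) != 0;	/* bit 4: string instruction (INS/OUTS) */
	unsigned port   = (q >> 16) & 0xffff;	/* bits 31:16: port number */

	printf("%s%s port 0x%x, %d byte(s)\n",
	       in ? "IN" : "OUT", string ? "S" : "", port, size);
	return 0;
}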
| 7209 | |||
| 7210 | static void | ||
| 7211 | vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) | ||
| 7212 | { | ||
| 7213 | /* | ||
| 7214 | * Patch in the VMCALL instruction: | ||
| 7215 | */ | ||
| 7216 | hypercall[0] = 0x0f; | ||
| 7217 | hypercall[1] = 0x01; | ||
| 7218 | hypercall[2] = 0xc1; | ||
| 7219 | } | ||
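The three bytes patched in above are the VMCALL encoding (0f 01 c1). For context, this is roughly what the patched sequence looks like from the guest side; the register convention (hypercall number in RAX, first argument in RBX, result in RAX) follows the usual KVM hypercall ABI and should be treated as an assumption here, since vmx.c itself does not define it:

/*
 * Illustrative guest-side use of the patched hypercall instruction.
 * Assumes the conventional KVM ABI: hypercall number in RAX, first
 * argument in RBX, result returned in RAX.  Not part of vmx.c.
 */
static inline long example_kvm_hypercall1(unsigned long nr, unsigned long a0)
{
	long ret;

	asm volatile(".byte 0x0f, 0x01, 0xc1"	/* VMCALL, as patched above */
		     : "=a"(ret)
		     : "a"(nr), "b"(a0)
		     : "memory");
	return ret;
}

In a real guest this would only run after the hypervisor has been asked to patch the bytes in via the path above.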
| 7220 | |||
| 7221 | /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ | ||
| 7222 | static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) | ||
| 7223 | { | ||
| 7224 | if (is_guest_mode(vcpu)) { | ||
| 7225 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 7226 | unsigned long orig_val = val; | ||
| 7227 | |||
| 7228 | /* | ||
| 7229 | * We get here when L2 changed cr0 in a way that did not change | ||
| 7230 | * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), | ||
| 7231 | * but did change L0 shadowed bits. So we first calculate the | ||
| 7232 | * effective cr0 value that L1 would like to write into the | ||
| 7233 | * hardware. It consists of the L2-owned bits from the new | ||
| 7234 | * value combined with the L1-owned bits from L1's guest_cr0. | ||
| 7235 | */ | ||
| 7236 | val = (val & ~vmcs12->cr0_guest_host_mask) | | ||
| 7237 | (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); | ||
| 7238 | |||
| 7239 | if (!nested_guest_cr0_valid(vcpu, val)) | ||
| 7240 | return 1; | ||
| 7241 | |||
| 7242 | if (kvm_set_cr0(vcpu, val)) | ||
| 7243 | return 1; | ||
| 7244 | vmcs_writel(CR0_READ_SHADOW, orig_val); | ||
| 7245 | return 0; | ||
| 7246 | } else { | ||
| 7247 | if (to_vmx(vcpu)->nested.vmxon && | ||
| 7248 | !nested_host_cr0_valid(vcpu, val)) | ||
| 7249 | return 1; | ||
| 7250 | |||
| 7251 | return kvm_set_cr0(vcpu, val); | ||
| 7252 | } | ||
| 7253 | } | ||
| 7254 | |||
| 7255 | static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) | ||
| 7256 | { | ||
| 7257 | if (is_guest_mode(vcpu)) { | ||
| 7258 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 7259 | unsigned long orig_val = val; | ||
| 7260 | |||
| 7261 | /* analogously to handle_set_cr0 */ | ||
| 7262 | val = (val & ~vmcs12->cr4_guest_host_mask) | | ||
| 7263 | (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); | ||
| 7264 | if (kvm_set_cr4(vcpu, val)) | ||
| 7265 | return 1; | ||
| 7266 | vmcs_writel(CR4_READ_SHADOW, orig_val); | ||
| 7267 | return 0; | ||
| 7268 | } else | ||
| 7269 | return kvm_set_cr4(vcpu, val); | ||
| 7270 | } | ||
| 7271 | |||
| 7272 | static int handle_desc(struct kvm_vcpu *vcpu) | ||
| 7273 | { | ||
| 7274 | WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP)); | ||
| 7275 | return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; | ||
| 7276 | } | ||
| 7277 | |||
| 7278 | static int handle_cr(struct kvm_vcpu *vcpu) | ||
| 7279 | { | ||
| 7280 | unsigned long exit_qualification, val; | ||
| 7281 | int cr; | ||
| 7282 | int reg; | ||
| 7283 | int err; | ||
| 7284 | int ret; | ||
| 7285 | |||
| 7286 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 7287 | cr = exit_qualification & 15; | ||
| 7288 | reg = (exit_qualification >> 8) & 15; | ||
| 7289 | switch ((exit_qualification >> 4) & 3) { | ||
| 7290 | case 0: /* mov to cr */ | ||
| 7291 | val = kvm_register_readl(vcpu, reg); | ||
| 7292 | trace_kvm_cr_write(cr, val); | ||
| 7293 | switch (cr) { | ||
| 7294 | case 0: | ||
| 7295 | err = handle_set_cr0(vcpu, val); | ||
| 7296 | return kvm_complete_insn_gp(vcpu, err); | ||
| 7297 | case 3: | ||
| 7298 | WARN_ON_ONCE(enable_unrestricted_guest); | ||
| 7299 | err = kvm_set_cr3(vcpu, val); | ||
| 7300 | return kvm_complete_insn_gp(vcpu, err); | ||
| 7301 | case 4: | ||
| 7302 | err = handle_set_cr4(vcpu, val); | ||
| 7303 | return kvm_complete_insn_gp(vcpu, err); | ||
| 7304 | case 8: { | ||
| 7305 | u8 cr8_prev = kvm_get_cr8(vcpu); | ||
| 7306 | u8 cr8 = (u8)val; | ||
| 7307 | err = kvm_set_cr8(vcpu, cr8); | ||
| 7308 | ret = kvm_complete_insn_gp(vcpu, err); | ||
| 7309 | if (lapic_in_kernel(vcpu)) | ||
| 7310 | return ret; | ||
| 7311 | if (cr8_prev <= cr8) | ||
| 7312 | return ret; | ||
| 7313 | /* | ||
| 7314 | * TODO: we might be squashing a | ||
| 7315 | * KVM_GUESTDBG_SINGLESTEP-triggered | ||
| 7316 | * KVM_EXIT_DEBUG here. | ||
| 7317 | */ | ||
| 7318 | vcpu->run->exit_reason = KVM_EXIT_SET_TPR; | ||
| 7319 | return 0; | ||
| 7320 | } | ||
| 7321 | } | ||
| 7322 | break; | ||
| 7323 | case 2: /* clts */ | ||
| 7324 | WARN_ONCE(1, "Guest should always own CR0.TS"); | ||
| 7325 | vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); | ||
| 7326 | trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); | ||
| 7327 | return kvm_skip_emulated_instruction(vcpu); | ||
| 7328 | case 1: /*mov from cr*/ | ||
| 7329 | switch (cr) { | ||
| 7330 | case 3: | ||
| 7331 | WARN_ON_ONCE(enable_unrestricted_guest); | ||
| 7332 | val = kvm_read_cr3(vcpu); | ||
| 7333 | kvm_register_write(vcpu, reg, val); | ||
| 7334 | trace_kvm_cr_read(cr, val); | ||
| 7335 | return kvm_skip_emulated_instruction(vcpu); | ||
| 7336 | case 8: | ||
| 7337 | val = kvm_get_cr8(vcpu); | ||
| 7338 | kvm_register_write(vcpu, reg, val); | ||
| 7339 | trace_kvm_cr_read(cr, val); | ||
| 7340 | return kvm_skip_emulated_instruction(vcpu); | ||
| 7341 | } | ||
| 7342 | break; | ||
| 7343 | case 3: /* lmsw */ | ||
| 7344 | val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; | ||
| 7345 | trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); | ||
| 7346 | kvm_lmsw(vcpu, val); | ||
| 7347 | |||
| 7348 | return kvm_skip_emulated_instruction(vcpu); | ||
| 7349 | default: | ||
| 7350 | break; | ||
| 7351 | } | ||
| 7352 | vcpu->run->exit_reason = 0; | ||
| 7353 | vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", | ||
| 7354 | (int)(exit_qualification >> 4) & 3, cr); | ||
| 7355 | return 0; | ||
| 7356 | } | ||
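As a worked example of the decode above: a hypothetical exit qualification of 0x304 yields cr = 0x304 & 15 = 4, access type = (0x304 >> 4) & 3 = 0 (MOV to CR) and reg = (0x304 >> 8) & 15 = 3 (RBX), i.e. a guest MOV from RBX into CR4, which lands in the handle_set_cr4() path.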
| 7357 | |||
| 7358 | static int handle_dr(struct kvm_vcpu *vcpu) | ||
| 7359 | { | ||
| 7360 | unsigned long exit_qualification; | ||
| 7361 | int dr, dr7, reg; | ||
| 7362 | |||
| 7363 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 7364 | dr = exit_qualification & DEBUG_REG_ACCESS_NUM; | ||
| 7365 | |||
| 7366 | /* First, if DR does not exist, trigger UD */ | ||
| 7367 | if (!kvm_require_dr(vcpu, dr)) | ||
| 7368 | return 1; | ||
| 7369 | |||
| 7370 | /* Do not handle if the CPL > 0, will trigger GP on re-entry */ | ||
| 7371 | if (!kvm_require_cpl(vcpu, 0)) | ||
| 7372 | return 1; | ||
| 7373 | dr7 = vmcs_readl(GUEST_DR7); | ||
| 7374 | if (dr7 & DR7_GD) { | ||
| 7375 | /* | ||
| 7376 | * As the vm-exit takes precedence over the debug trap, we | ||
| 7377 | * need to emulate the latter, either for the host debugger or | ||
| 7378 | * for the guest debugging itself. | ||
| 7379 | */ | ||
| 7380 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { | ||
| 7381 | vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; | ||
| 7382 | vcpu->run->debug.arch.dr7 = dr7; | ||
| 7383 | vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); | ||
| 7384 | vcpu->run->debug.arch.exception = DB_VECTOR; | ||
| 7385 | vcpu->run->exit_reason = KVM_EXIT_DEBUG; | ||
| 7386 | return 0; | ||
| 7387 | } else { | ||
| 7388 | vcpu->arch.dr6 &= ~15; | ||
| 7389 | vcpu->arch.dr6 |= DR6_BD | DR6_RTM; | ||
| 7390 | kvm_queue_exception(vcpu, DB_VECTOR); | ||
| 7391 | return 1; | ||
| 7392 | } | ||
| 7393 | } | ||
| 7394 | |||
| 7395 | if (vcpu->guest_debug == 0) { | ||
| 7396 | vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, | ||
| 7397 | CPU_BASED_MOV_DR_EXITING); | ||
| 7398 | |||
| 7399 | /* | ||
| 7400 | * No more DR vmexits; force a reload of the debug registers | ||
| 7401 | * and reenter on this instruction. The next vmexit will | ||
| 7402 | * retrieve the full state of the debug registers. | ||
| 7403 | */ | ||
| 7404 | vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; | ||
| 7405 | return 1; | ||
| 7406 | } | ||
| 7407 | |||
| 7408 | reg = DEBUG_REG_ACCESS_REG(exit_qualification); | ||
| 7409 | if (exit_qualification & TYPE_MOV_FROM_DR) { | ||
| 7410 | unsigned long val; | ||
| 7411 | |||
| 7412 | if (kvm_get_dr(vcpu, dr, &val)) | ||
| 7413 | return 1; | ||
| 7414 | kvm_register_write(vcpu, reg, val); | ||
| 7415 | } else | ||
| 7416 | if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg))) | ||
| 7417 | return 1; | ||
| 7418 | |||
| 7419 | return kvm_skip_emulated_instruction(vcpu); | ||
| 7420 | } | ||
| 7421 | |||
| 7422 | static u64 vmx_get_dr6(struct kvm_vcpu *vcpu) | ||
| 7423 | { | ||
| 7424 | return vcpu->arch.dr6; | ||
| 7425 | } | ||
| 7426 | |||
| 7427 | static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) | ||
| 7428 | { | ||
| 7429 | } | ||
| 7430 | |||
| 7431 | static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) | ||
| 7432 | { | ||
| 7433 | get_debugreg(vcpu->arch.db[0], 0); | ||
| 7434 | get_debugreg(vcpu->arch.db[1], 1); | ||
| 7435 | get_debugreg(vcpu->arch.db[2], 2); | ||
| 7436 | get_debugreg(vcpu->arch.db[3], 3); | ||
| 7437 | get_debugreg(vcpu->arch.dr6, 6); | ||
| 7438 | vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); | ||
| 7439 | |||
| 7440 | vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; | ||
| 7441 | vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING); | ||
| 7442 | } | ||
| 7443 | |||
| 7444 | static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) | ||
| 7445 | { | ||
| 7446 | vmcs_writel(GUEST_DR7, val); | ||
| 7447 | } | ||
| 7448 | |||
| 7449 | static int handle_cpuid(struct kvm_vcpu *vcpu) | ||
| 7450 | { | ||
| 7451 | return kvm_emulate_cpuid(vcpu); | ||
| 7452 | } | ||
| 7453 | |||
| 7454 | static int handle_rdmsr(struct kvm_vcpu *vcpu) | ||
| 7455 | { | ||
| 7456 | u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; | ||
| 7457 | struct msr_data msr_info; | ||
| 7458 | |||
| 7459 | msr_info.index = ecx; | ||
| 7460 | msr_info.host_initiated = false; | ||
| 7461 | if (vmx_get_msr(vcpu, &msr_info)) { | ||
| 7462 | trace_kvm_msr_read_ex(ecx); | ||
| 7463 | kvm_inject_gp(vcpu, 0); | ||
| 7464 | return 1; | ||
| 7465 | } | ||
| 7466 | |||
| 7467 | trace_kvm_msr_read(ecx, msr_info.data); | ||
| 7468 | |||
| 7469 | /* FIXME: handling of bits 32:63 of rax, rdx */ | ||
| 7470 | vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u; | ||
| 7471 | vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u; | ||
| 7472 | return kvm_skip_emulated_instruction(vcpu); | ||
| 7473 | } | ||
| 7474 | |||
| 7475 | static int handle_wrmsr(struct kvm_vcpu *vcpu) | ||
| 7476 | { | ||
| 7477 | struct msr_data msr; | ||
| 7478 | u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; | ||
| 7479 | u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | ||
| 7480 | | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); | ||
| 7481 | |||
| 7482 | msr.data = data; | ||
| 7483 | msr.index = ecx; | ||
| 7484 | msr.host_initiated = false; | ||
| 7485 | if (kvm_set_msr(vcpu, &msr) != 0) { | ||
| 7486 | trace_kvm_msr_write_ex(ecx, data); | ||
| 7487 | kvm_inject_gp(vcpu, 0); | ||
| 7488 | return 1; | ||
| 7489 | } | ||
| 7490 | |||
| 7491 | trace_kvm_msr_write(ecx, data); | ||
| 7492 | return kvm_skip_emulated_instruction(vcpu); | ||
| 7493 | } | ||
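Both MSR handlers above split or assemble the 64-bit MSR value across EDX:EAX exactly as the RDMSR/WRMSR instructions do. A small standalone illustration of that packing; the register values are invented:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* WRMSR direction: EDX:EAX -> 64-bit value. */
	uint32_t eax = 0xfee00900, edx = 0x00000000;	/* hypothetical samples */
	uint64_t data = (uint64_t)eax | ((uint64_t)edx << 32);

	/* RDMSR direction: 64-bit value -> EDX:EAX. */
	uint32_t lo = (uint32_t)data, hi = (uint32_t)(data >> 32);

	printf("data=%#llx lo=%#x hi=%#x\n", (unsigned long long)data, lo, hi);
	return 0;
}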
| 7494 | |||
| 7495 | static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) | ||
| 7496 | { | ||
| 7497 | kvm_apic_update_ppr(vcpu); | ||
| 7498 | return 1; | ||
| 7499 | } | ||
| 7500 | |||
| 7501 | static int handle_interrupt_window(struct kvm_vcpu *vcpu) | ||
| 7502 | { | ||
| 7503 | vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, | ||
| 7504 | CPU_BASED_VIRTUAL_INTR_PENDING); | ||
| 7505 | |||
| 7506 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
| 7507 | |||
| 7508 | ++vcpu->stat.irq_window_exits; | ||
| 7509 | return 1; | ||
| 7510 | } | ||
| 7511 | |||
| 7512 | static int handle_halt(struct kvm_vcpu *vcpu) | ||
| 7513 | { | ||
| 7514 | return kvm_emulate_halt(vcpu); | ||
| 7515 | } | ||
| 7516 | |||
| 7517 | static int handle_vmcall(struct kvm_vcpu *vcpu) | ||
| 7518 | { | ||
| 7519 | return kvm_emulate_hypercall(vcpu); | ||
| 7520 | } | ||
| 7521 | |||
| 7522 | static int handle_invd(struct kvm_vcpu *vcpu) | ||
| 7523 | { | ||
| 7524 | return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; | ||
| 7525 | } | ||
| 7526 | |||
| 7527 | static int handle_invlpg(struct kvm_vcpu *vcpu) | ||
| 7528 | { | ||
| 7529 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 7530 | |||
| 7531 | kvm_mmu_invlpg(vcpu, exit_qualification); | ||
| 7532 | return kvm_skip_emulated_instruction(vcpu); | ||
| 7533 | } | ||
| 7534 | |||
| 7535 | static int handle_rdpmc(struct kvm_vcpu *vcpu) | ||
| 7536 | { | ||
| 7537 | int err; | ||
| 7538 | |||
| 7539 | err = kvm_rdpmc(vcpu); | ||
| 7540 | return kvm_complete_insn_gp(vcpu, err); | ||
| 7541 | } | ||
| 7542 | |||
| 7543 | static int handle_wbinvd(struct kvm_vcpu *vcpu) | ||
| 7544 | { | ||
| 7545 | return kvm_emulate_wbinvd(vcpu); | ||
| 7546 | } | ||
| 7547 | |||
| 7548 | static int handle_xsetbv(struct kvm_vcpu *vcpu) | ||
| 7549 | { | ||
| 7550 | u64 new_bv = kvm_read_edx_eax(vcpu); | ||
| 7551 | u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); | ||
| 7552 | |||
| 7553 | if (kvm_set_xcr(vcpu, index, new_bv) == 0) | ||
| 7554 | return kvm_skip_emulated_instruction(vcpu); | ||
| 7555 | return 1; | ||
| 7556 | } | ||
| 7557 | |||
| 7558 | static int handle_xsaves(struct kvm_vcpu *vcpu) | ||
| 7559 | { | ||
| 7560 | kvm_skip_emulated_instruction(vcpu); | ||
| 7561 | WARN(1, "this should never happen\n"); | ||
| 7562 | return 1; | ||
| 7563 | } | ||
| 7564 | |||
| 7565 | static int handle_xrstors(struct kvm_vcpu *vcpu) | ||
| 7566 | { | ||
| 7567 | kvm_skip_emulated_instruction(vcpu); | ||
| 7568 | WARN(1, "this should never happen\n"); | ||
| 7569 | return 1; | ||
| 7570 | } | ||
| 7571 | |||
| 7572 | static int handle_apic_access(struct kvm_vcpu *vcpu) | ||
| 7573 | { | ||
| 7574 | if (likely(fasteoi)) { | ||
| 7575 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 7576 | int access_type, offset; | ||
| 7577 | |||
| 7578 | access_type = exit_qualification & APIC_ACCESS_TYPE; | ||
| 7579 | offset = exit_qualification & APIC_ACCESS_OFFSET; | ||
| 7580 | /* | ||
| 7581 | * A sane guest uses MOV to write the EOI register, and the written | ||
| 7582 | * value does not matter. Short-circuit that case here to avoid | ||
| 7583 | * heavy instruction emulation. | ||
| 7584 | */ | ||
| 7585 | if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && | ||
| 7586 | (offset == APIC_EOI)) { | ||
| 7587 | kvm_lapic_set_eoi(vcpu); | ||
| 7588 | return kvm_skip_emulated_instruction(vcpu); | ||
| 7589 | } | ||
| 7590 | } | ||
| 7591 | return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; | ||
| 7592 | } | ||
| 7593 | |||
| 7594 | static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) | ||
| 7595 | { | ||
| 7596 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 7597 | int vector = exit_qualification & 0xff; | ||
| 7598 | |||
| 7599 | /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ | ||
| 7600 | kvm_apic_set_eoi_accelerated(vcpu, vector); | ||
| 7601 | return 1; | ||
| 7602 | } | ||
| 7603 | |||
| 7604 | static int handle_apic_write(struct kvm_vcpu *vcpu) | ||
| 7605 | { | ||
| 7606 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 7607 | u32 offset = exit_qualification & 0xfff; | ||
| 7608 | |||
| 7609 | /* APIC-write VM exit is trap-like and thus no need to adjust IP */ | ||
| 7610 | kvm_apic_write_nodecode(vcpu, offset); | ||
| 7611 | return 1; | ||
| 7612 | } | ||
| 7613 | |||
| 7614 | static int handle_task_switch(struct kvm_vcpu *vcpu) | ||
| 7615 | { | ||
| 7616 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 7617 | unsigned long exit_qualification; | ||
| 7618 | bool has_error_code = false; | ||
| 7619 | u32 error_code = 0; | ||
| 7620 | u16 tss_selector; | ||
| 7621 | int reason, type, idt_v, idt_index; | ||
| 7622 | |||
| 7623 | idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); | ||
| 7624 | idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); | ||
| 7625 | type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); | ||
| 7626 | |||
| 7627 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 7628 | |||
| 7629 | reason = (u32)exit_qualification >> 30; | ||
| 7630 | if (reason == TASK_SWITCH_GATE && idt_v) { | ||
| 7631 | switch (type) { | ||
| 7632 | case INTR_TYPE_NMI_INTR: | ||
| 7633 | vcpu->arch.nmi_injected = false; | ||
| 7634 | vmx_set_nmi_mask(vcpu, true); | ||
| 7635 | break; | ||
| 7636 | case INTR_TYPE_EXT_INTR: | ||
| 7637 | case INTR_TYPE_SOFT_INTR: | ||
| 7638 | kvm_clear_interrupt_queue(vcpu); | ||
| 7639 | break; | ||
| 7640 | case INTR_TYPE_HARD_EXCEPTION: | ||
| 7641 | if (vmx->idt_vectoring_info & | ||
| 7642 | VECTORING_INFO_DELIVER_CODE_MASK) { | ||
| 7643 | has_error_code = true; | ||
| 7644 | error_code = | ||
| 7645 | vmcs_read32(IDT_VECTORING_ERROR_CODE); | ||
| 7646 | } | ||
| 7647 | /* fall through */ | ||
| 7648 | case INTR_TYPE_SOFT_EXCEPTION: | ||
| 7649 | kvm_clear_exception_queue(vcpu); | ||
| 7650 | break; | ||
| 7651 | default: | ||
| 7652 | break; | ||
| 7653 | } | ||
| 7654 | } | ||
| 7655 | tss_selector = exit_qualification; | ||
| 7656 | |||
| 7657 | if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && | ||
| 7658 | type != INTR_TYPE_EXT_INTR && | ||
| 7659 | type != INTR_TYPE_NMI_INTR)) | ||
| 7660 | skip_emulated_instruction(vcpu); | ||
| 7661 | |||
| 7662 | if (kvm_task_switch(vcpu, tss_selector, | ||
| 7663 | type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason, | ||
| 7664 | has_error_code, error_code) == EMULATE_FAIL) { | ||
| 7665 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; | ||
| 7666 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; | ||
| 7667 | vcpu->run->internal.ndata = 0; | ||
| 7668 | return 0; | ||
| 7669 | } | ||
| 7670 | |||
| 7671 | /* | ||
| 7672 | * TODO: What about debug traps on tss switch? | ||
| 7673 | * Are we supposed to inject them and update dr6? | ||
| 7674 | */ | ||
| 7675 | |||
| 7676 | return 1; | ||
| 7677 | } | ||
| 7678 | |||
| 7679 | static int handle_ept_violation(struct kvm_vcpu *vcpu) | ||
| 7680 | { | ||
| 7681 | unsigned long exit_qualification; | ||
| 7682 | gpa_t gpa; | ||
| 7683 | u64 error_code; | ||
| 7684 | |||
| 7685 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 7686 | |||
| 7687 | /* | ||
| 7688 | * If the EPT violation happened while executing IRET from an NMI, | ||
| 7689 | * the "blocked by NMI" bit has to be set before the next VM entry. | ||
| 7690 | * There are errata that may cause this bit to not be set: | ||
| 7691 | * AAK134, BY25. | ||
| 7692 | */ | ||
| 7693 | if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && | ||
| 7694 | enable_vnmi && | ||
| 7695 | (exit_qualification & INTR_INFO_UNBLOCK_NMI)) | ||
| 7696 | vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); | ||
| 7697 | |||
| 7698 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); | ||
| 7699 | trace_kvm_page_fault(gpa, exit_qualification); | ||
| 7700 | |||
| 7701 | /* Is it a read fault? */ | ||
| 7702 | error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) | ||
| 7703 | ? PFERR_USER_MASK : 0; | ||
| 7704 | /* Is it a write fault? */ | ||
| 7705 | error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) | ||
| 7706 | ? PFERR_WRITE_MASK : 0; | ||
| 7707 | /* Is it a fetch fault? */ | ||
| 7708 | error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) | ||
| 7709 | ? PFERR_FETCH_MASK : 0; | ||
| 7710 | /* Is the EPT page-table entry present? */ | ||
| 7711 | error_code |= (exit_qualification & | ||
| 7712 | (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE | | ||
| 7713 | EPT_VIOLATION_EXECUTABLE)) | ||
| 7714 | ? PFERR_PRESENT_MASK : 0; | ||
| 7715 | |||
| 7716 | error_code |= (exit_qualification & 0x100) != 0 ? | ||
| 7717 | PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; | ||
| 7718 | |||
| 7719 | vcpu->arch.exit_qualification = exit_qualification; | ||
| 7720 | return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); | ||
| 7721 | } | ||
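For a concrete (hypothetical) example of the translation above, with the exit-qualification bit numbering taken from the Intel SDM: a guest write to a page whose EPT entry is readable but not writable, taken during the final GPA translation, shows up as exit_qualification = 0x10a (write access = bit 1, entry readable = bit 3, final translation = bit 8) and is folded into error_code = PFERR_WRITE_MASK | PFERR_PRESENT_MASK | PFERR_GUEST_FINAL_MASK before being handed to kvm_mmu_page_fault().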
| 7722 | |||
| 7723 | static int handle_ept_misconfig(struct kvm_vcpu *vcpu) | ||
| 7724 | { | ||
| 7725 | gpa_t gpa; | ||
| 7726 | |||
| 7727 | /* | ||
| 7728 | * A nested guest cannot optimize MMIO vmexits, because we have an | ||
| 7729 | * nGPA here instead of the required GPA. | ||
| 7730 | */ | ||
| 7731 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); | ||
| 7732 | if (!is_guest_mode(vcpu) && | ||
| 7733 | !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { | ||
| 7734 | trace_kvm_fast_mmio(gpa); | ||
| 7735 | /* | ||
| 7736 | * Doing kvm_skip_emulated_instruction() depends on undefined | ||
| 7737 | * behavior: Intel's manual doesn't mandate that | ||
| 7738 | * VM_EXIT_INSTRUCTION_LEN be set in the VMCS when an EPT MISCONFIG | ||
| 7739 | * occurs. While it was observed to be set on real hardware, other | ||
| 7740 | * hypervisors (namely Hyper-V) don't set it, so we would end up | ||
| 7741 | * advancing the IP by some random value. Disable fast mmio when | ||
| 7742 | * running nested and keep it for real hardware, in the hope that | ||
| 7743 | * VM_EXIT_INSTRUCTION_LEN will always be set correctly. | ||
| 7744 | */ | ||
| 7745 | if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) | ||
| 7746 | return kvm_skip_emulated_instruction(vcpu); | ||
| 7747 | else | ||
| 7748 | return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) == | ||
| 7749 | EMULATE_DONE; | ||
| 7750 | } | ||
| 7751 | |||
| 7752 | return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); | ||
| 7753 | } | ||
| 7754 | |||
| 7755 | static int handle_nmi_window(struct kvm_vcpu *vcpu) | ||
| 7756 | { | ||
| 7757 | WARN_ON_ONCE(!enable_vnmi); | ||
| 7758 | vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, | ||
| 7759 | CPU_BASED_VIRTUAL_NMI_PENDING); | ||
| 7760 | ++vcpu->stat.nmi_window_exits; | ||
| 7761 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
| 7762 | |||
| 7763 | return 1; | ||
| 7764 | } | ||
| 7765 | |||
| 7766 | static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) | ||
| 7767 | { | ||
| 7768 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 7769 | enum emulation_result err = EMULATE_DONE; | ||
| 7770 | int ret = 1; | ||
| 7771 | u32 cpu_exec_ctrl; | ||
| 7772 | bool intr_window_requested; | ||
| 7773 | unsigned count = 130; | ||
| 7774 | |||
| 7775 | /* | ||
| 7776 | * We should never reach the point where we are emulating L2 | ||
| 7777 | * due to invalid guest state as that means we incorrectly | ||
| 7778 | * allowed a nested VMEntry with an invalid vmcs12. | ||
| 7779 | */ | ||
| 7780 | WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending); | ||
| 7781 | |||
| 7782 | cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | ||
| 7783 | intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; | ||
| 7784 | |||
| 7785 | while (vmx->emulation_required && count-- != 0) { | ||
| 7786 | if (intr_window_requested && vmx_interrupt_allowed(vcpu)) | ||
| 7787 | return handle_interrupt_window(&vmx->vcpu); | ||
| 7788 | |||
| 7789 | if (kvm_test_request(KVM_REQ_EVENT, vcpu)) | ||
| 7790 | return 1; | ||
| 7791 | |||
| 7792 | err = kvm_emulate_instruction(vcpu, 0); | ||
| 7793 | |||
| 7794 | if (err == EMULATE_USER_EXIT) { | ||
| 7795 | ++vcpu->stat.mmio_exits; | ||
| 7796 | ret = 0; | ||
| 7797 | goto out; | ||
| 7798 | } | ||
| 7799 | |||
| 7800 | if (err != EMULATE_DONE) | ||
| 7801 | goto emulation_error; | ||
| 7802 | |||
| 7803 | if (vmx->emulation_required && !vmx->rmode.vm86_active && | ||
| 7804 | vcpu->arch.exception.pending) | ||
| 7805 | goto emulation_error; | ||
| 7806 | |||
| 7807 | if (vcpu->arch.halt_request) { | ||
| 7808 | vcpu->arch.halt_request = 0; | ||
| 7809 | ret = kvm_vcpu_halt(vcpu); | ||
| 7810 | goto out; | ||
| 7811 | } | ||
| 7812 | |||
| 7813 | if (signal_pending(current)) | ||
| 7814 | goto out; | ||
| 7815 | if (need_resched()) | ||
| 7816 | schedule(); | ||
| 7817 | } | ||
| 7818 | |||
| 7819 | out: | ||
| 7820 | return ret; | ||
| 7821 | |||
| 7822 | emulation_error: | ||
| 7823 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; | ||
| 7824 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; | ||
| 7825 | vcpu->run->internal.ndata = 0; | ||
| 7826 | return 0; | ||
| 7827 | } | ||
| 7828 | |||
| 7829 | static void grow_ple_window(struct kvm_vcpu *vcpu) | ||
| 7830 | { | ||
| 7831 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 7832 | int old = vmx->ple_window; | ||
| 7833 | |||
| 7834 | vmx->ple_window = __grow_ple_window(old, ple_window, | ||
| 7835 | ple_window_grow, | ||
| 7836 | ple_window_max); | ||
| 7837 | |||
| 7838 | if (vmx->ple_window != old) | ||
| 7839 | vmx->ple_window_dirty = true; | ||
| 7840 | |||
| 7841 | trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old); | ||
| 7842 | } | ||
| 7843 | |||
| 7844 | static void shrink_ple_window(struct kvm_vcpu *vcpu) | ||
| 7845 | { | ||
| 7846 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 7847 | int old = vmx->ple_window; | ||
| 7848 | |||
| 7849 | vmx->ple_window = __shrink_ple_window(old, ple_window, | ||
| 7850 | ple_window_shrink, | ||
| 7851 | ple_window); | ||
| 7852 | |||
| 7853 | if (vmx->ple_window != old) | ||
| 7854 | vmx->ple_window_dirty = true; | ||
| 7855 | |||
| 7856 | trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old); | ||
| 7857 | } | ||
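grow_ple_window() and shrink_ple_window() move the PAUSE-loop-exiting window between the base value and ple_window_max using the module parameters; the actual scaling policy lives in __grow_ple_window()/__shrink_ple_window(), which are not shown here. A deliberately simplified, hypothetical sketch of the clamping behaviour, assuming a purely multiplicative grow and divisive shrink (the real helpers are more general):

#include <stdio.h>

/* Hypothetical illustration only; not the in-kernel helpers. */
static unsigned int grow(unsigned int val, unsigned int factor, unsigned int max)
{
	unsigned long long ret = (unsigned long long)val * factor;
	return ret > max ? max : (unsigned int)ret;	/* clamp at the maximum */
}

static unsigned int shrink(unsigned int val, unsigned int divisor, unsigned int base)
{
	unsigned int ret = divisor ? val / divisor : base;
	return ret < base ? base : ret;			/* never go below the base */
}

int main(void)
{
	unsigned int w = 4096;			/* typical default ple_window */

	w = grow(w, 2, 16384);			/* example sequence of adjustments */
	w = grow(w, 2, 16384);
	w = shrink(w, 2, 4096);
	printf("ple_window = %u\n", w);		/* 4096 -> 8192 -> 16384 -> 8192 */
	return 0;
}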
| 7858 | |||
| 7859 | /* | ||
| 7860 | * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. | ||
| 7861 | */ | ||
| 7862 | static void wakeup_handler(void) | ||
| 7863 | { | ||
| 7864 | struct kvm_vcpu *vcpu; | ||
| 7865 | int cpu = smp_processor_id(); | ||
| 7866 | |||
| 7867 | spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); | ||
| 7868 | list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), | ||
| 7869 | blocked_vcpu_list) { | ||
| 7870 | struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); | ||
| 7871 | |||
| 7872 | if (pi_test_on(pi_desc) == 1) | ||
| 7873 | kvm_vcpu_kick(vcpu); | ||
| 7874 | } | ||
| 7875 | spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); | ||
| 7876 | } | ||
| 7877 | |||
| 7878 | static void vmx_enable_tdp(void) | ||
| 7879 | { | ||
| 7880 | kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, | ||
| 7881 | enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull, | ||
| 7882 | enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, | ||
| 7883 | 0ull, VMX_EPT_EXECUTABLE_MASK, | ||
| 7884 | cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, | ||
| 7885 | VMX_EPT_RWX_MASK, 0ull); | ||
| 7886 | |||
| 7887 | ept_set_mmio_spte_mask(); | ||
| 7888 | kvm_enable_tdp(); | ||
| 7889 | } | ||
| 7890 | |||
| 7891 | static __init int hardware_setup(void) | ||
| 7892 | { | ||
| 7893 | unsigned long host_bndcfgs; | ||
| 7894 | int r = -ENOMEM, i; | ||
| 7895 | |||
| 7896 | rdmsrl_safe(MSR_EFER, &host_efer); | ||
| 7897 | |||
| 7898 | for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) | ||
| 7899 | kvm_define_shared_msr(i, vmx_msr_index[i]); | ||
| 7900 | |||
| 7901 | for (i = 0; i < VMX_BITMAP_NR; i++) { | ||
| 7902 | vmx_bitmap[i] = (unsigned long *)__get_free_page(GFP_KERNEL); | ||
| 7903 | if (!vmx_bitmap[i]) | ||
| 7904 | goto out; | ||
| 7905 | } | ||
| 7906 | |||
| 7907 | memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); | ||
| 7908 | memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); | ||
| 7909 | |||
| 7910 | if (setup_vmcs_config(&vmcs_config) < 0) { | ||
| 7911 | r = -EIO; | ||
| 7912 | goto out; | ||
| 7913 | } | ||
| 7914 | |||
| 7915 | if (boot_cpu_has(X86_FEATURE_NX)) | ||
| 7916 | kvm_enable_efer_bits(EFER_NX); | ||
| 7917 | |||
| 7918 | if (boot_cpu_has(X86_FEATURE_MPX)) { | ||
| 7919 | rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); | ||
| 7920 | WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); | ||
| 7921 | } | ||
| 7922 | |||
| 7923 | if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || | ||
| 7924 | !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) | ||
| 7925 | enable_vpid = 0; | ||
| 7926 | |||
| 7927 | if (!cpu_has_vmx_ept() || | ||
| 7928 | !cpu_has_vmx_ept_4levels() || | ||
| 7929 | !cpu_has_vmx_ept_mt_wb() || | ||
| 7930 | !cpu_has_vmx_invept_global()) | ||
| 7931 | enable_ept = 0; | ||
| 7932 | |||
| 7933 | if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) | ||
| 7934 | enable_ept_ad_bits = 0; | ||
| 7935 | |||
| 7936 | if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) | ||
| 7937 | enable_unrestricted_guest = 0; | ||
| 7938 | |||
| 7939 | if (!cpu_has_vmx_flexpriority()) | ||
| 7940 | flexpriority_enabled = 0; | ||
| 7941 | |||
| 7942 | if (!cpu_has_virtual_nmis()) | ||
| 7943 | enable_vnmi = 0; | ||
| 7944 | |||
| 7945 | /* | ||
| 7946 | * set_apic_access_page_addr() is used to reload the APIC access | ||
| 7947 | * page upon invalidation. There is no need to do anything if we are | ||
| 7948 | * not using the APIC_ACCESS_ADDR VMCS field. | ||
| 7949 | */ | ||
| 7950 | if (!flexpriority_enabled) | ||
| 7951 | kvm_x86_ops->set_apic_access_page_addr = NULL; | ||
| 7952 | |||
| 7953 | if (!cpu_has_vmx_tpr_shadow()) | ||
| 7954 | kvm_x86_ops->update_cr8_intercept = NULL; | ||
| 7955 | |||
| 7956 | if (enable_ept && !cpu_has_vmx_ept_2m_page()) | ||
| 7957 | kvm_disable_largepages(); | ||
| 7958 | |||
| 7959 | #if IS_ENABLED(CONFIG_HYPERV) | ||
| 7960 | if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH | ||
| 7961 | && enable_ept) | ||
| 7962 | kvm_x86_ops->tlb_remote_flush = vmx_hv_remote_flush_tlb; | ||
| 7963 | #endif | ||
| 7964 | |||
| 7965 | if (!cpu_has_vmx_ple()) { | ||
| 7966 | ple_gap = 0; | ||
| 7967 | ple_window = 0; | ||
| 7968 | ple_window_grow = 0; | ||
| 7969 | ple_window_max = 0; | ||
| 7970 | ple_window_shrink = 0; | ||
| 7971 | } | ||
| 7972 | |||
| 7973 | if (!cpu_has_vmx_apicv()) { | ||
| 7974 | enable_apicv = 0; | ||
| 7975 | kvm_x86_ops->sync_pir_to_irr = NULL; | ||
| 7976 | } | ||
| 7977 | |||
| 7978 | if (cpu_has_vmx_tsc_scaling()) { | ||
| 7979 | kvm_has_tsc_control = true; | ||
| 7980 | kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; | ||
| 7981 | kvm_tsc_scaling_ratio_frac_bits = 48; | ||
| 7982 | } | ||
| 7983 | |||
| 7984 | set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ | ||
| 7985 | |||
| 7986 | if (enable_ept) | ||
| 7987 | vmx_enable_tdp(); | ||
| 7988 | else | ||
| 7989 | kvm_disable_tdp(); | ||
| 7990 | |||
| 7991 | if (!nested) { | ||
| 7992 | kvm_x86_ops->get_nested_state = NULL; | ||
| 7993 | kvm_x86_ops->set_nested_state = NULL; | ||
| 7994 | } | ||
| 7995 | |||
| 7996 | /* | ||
| 7997 | * Only enable PML when the hardware supports the PML feature and both | ||
| 7998 | * EPT and the EPT A/D bits are enabled -- PML depends on them to work. | ||
| 7999 | */ | ||
| 8000 | if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) | ||
| 8001 | enable_pml = 0; | ||
| 8002 | |||
| 8003 | if (!enable_pml) { | ||
| 8004 | kvm_x86_ops->slot_enable_log_dirty = NULL; | ||
| 8005 | kvm_x86_ops->slot_disable_log_dirty = NULL; | ||
| 8006 | kvm_x86_ops->flush_log_dirty = NULL; | ||
| 8007 | kvm_x86_ops->enable_log_dirty_pt_masked = NULL; | ||
| 8008 | } | ||
| 8009 | |||
| 8010 | if (!cpu_has_vmx_preemption_timer()) | ||
| 8011 | kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit; | ||
| 8012 | |||
| 8013 | if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) { | ||
| 8014 | u64 vmx_msr; | ||
| 8015 | |||
| 8016 | rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); | ||
| 8017 | cpu_preemption_timer_multi = | ||
| 8018 | vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; | ||
| 8019 | } else { | ||
| 8020 | kvm_x86_ops->set_hv_timer = NULL; | ||
| 8021 | kvm_x86_ops->cancel_hv_timer = NULL; | ||
| 8022 | } | ||
| 8023 | |||
| 8024 | if (!cpu_has_vmx_shadow_vmcs()) | ||
| 8025 | enable_shadow_vmcs = 0; | ||
| 8026 | if (enable_shadow_vmcs) | ||
| 8027 | init_vmcs_shadow_fields(); | ||
| 8028 | |||
| 8029 | kvm_set_posted_intr_wakeup_handler(wakeup_handler); | ||
| 8030 | nested_vmx_setup_ctls_msrs(&vmcs_config.nested, enable_apicv); | ||
| 8031 | |||
| 8032 | kvm_mce_cap_supported |= MCG_LMCE_P; | ||
| 8033 | |||
| 8034 | return alloc_kvm_area(); | ||
| 8035 | |||
| 8036 | out: | ||
| 8037 | for (i = 0; i < VMX_BITMAP_NR; i++) | ||
| 8038 | free_page((unsigned long)vmx_bitmap[i]); | ||
| 8039 | |||
| 8040 | return r; | ||
| 8041 | } | ||
| 8042 | |||
| 8043 | static __exit void hardware_unsetup(void) | ||
| 8044 | { | ||
| 8045 | int i; | ||
| 8046 | |||
| 8047 | for (i = 0; i < VMX_BITMAP_NR; i++) | ||
| 8048 | free_page((unsigned long)vmx_bitmap[i]); | ||
| 8049 | |||
| 8050 | free_kvm_area(); | ||
| 8051 | } | ||
| 8052 | |||
| 8053 | /* | ||
| 8054 | * Indicate that the vcpu is busy-waiting on a spinlock. We never enable plain | ||
| 8055 | * PAUSE exiting, so we only get here on CPUs with PAUSE-loop exiting (PLE). | ||
| 8056 | */ | ||
| 8057 | static int handle_pause(struct kvm_vcpu *vcpu) | ||
| 8058 | { | ||
| 8059 | if (!kvm_pause_in_guest(vcpu->kvm)) | ||
| 8060 | grow_ple_window(vcpu); | ||
| 8061 | |||
| 8062 | /* | ||
| 8063 | * Intel SDM Vol. 3, Section 25.1.3 says the "PAUSE-loop exiting" | ||
| 8064 | * VM-execution control is ignored if CPL > 0. KVM, however, never | ||
| 8065 | * sets PAUSE_EXITING and only sets PLE when supported, so the vcpu | ||
| 8066 | * must be at CPL=0 if it gets a PAUSE exit. | ||
| 8067 | */ | ||
| 8068 | kvm_vcpu_on_spin(vcpu, true); | ||
| 8069 | return kvm_skip_emulated_instruction(vcpu); | ||
| 8070 | } | ||
| 8071 | |||
| 8072 | static int handle_nop(struct kvm_vcpu *vcpu) | ||
| 8073 | { | ||
| 8074 | return kvm_skip_emulated_instruction(vcpu); | ||
| 8075 | } | ||
| 8076 | |||
| 8077 | static int handle_mwait(struct kvm_vcpu *vcpu) | ||
| 8078 | { | ||
| 8079 | printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); | ||
| 8080 | return handle_nop(vcpu); | ||
| 8081 | } | ||
| 8082 | |||
| 8083 | static int handle_invalid_op(struct kvm_vcpu *vcpu) | ||
| 8084 | { | ||
| 8085 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
| 8086 | return 1; | ||
| 8087 | } | ||
| 8088 | |||
| 8089 | static int handle_monitor_trap(struct kvm_vcpu *vcpu) | ||
| 8090 | { | ||
| 8091 | return 1; | ||
| 8092 | } | ||
| 8093 | |||
| 8094 | static int handle_monitor(struct kvm_vcpu *vcpu) | ||
| 8095 | { | ||
| 8096 | printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); | ||
| 8097 | return handle_nop(vcpu); | ||
| 8098 | } | ||
| 8099 | |||
| 8100 | /* | ||
| 8101 | * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), | ||
| 8102 | * set the success or error code of an emulated VMX instruction (as specified | ||
| 8103 | * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated | ||
| 8104 | * instruction. | ||
| 8105 | */ | ||
| 8106 | static int nested_vmx_succeed(struct kvm_vcpu *vcpu) | ||
| 8107 | { | ||
| 8108 | vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) | ||
| 8109 | & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | | ||
| 8110 | X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); | ||
| 8111 | return kvm_skip_emulated_instruction(vcpu); | ||
| 8112 | } | ||
| 8113 | |||
| 8114 | static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) | ||
| 8115 | { | ||
| 8116 | vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) | ||
| 8117 | & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | | ||
| 8118 | X86_EFLAGS_SF | X86_EFLAGS_OF)) | ||
| 8119 | | X86_EFLAGS_CF); | ||
| 8120 | return kvm_skip_emulated_instruction(vcpu); | ||
| 8121 | } | ||
| 8122 | |||
| 8123 | static int nested_vmx_failValid(struct kvm_vcpu *vcpu, | ||
| 8124 | u32 vm_instruction_error) | ||
| 8125 | { | ||
| 8126 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 8127 | |||
| 8128 | /* | ||
| 8129 | * failValid writes the error number to the current VMCS, which | ||
| 8130 | * can't be done if there isn't a current VMCS. | ||
| 8131 | */ | ||
| 8132 | if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs) | ||
| 8133 | return nested_vmx_failInvalid(vcpu); | ||
| 8134 | |||
| 8135 | vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) | ||
| 8136 | & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | | ||
| 8137 | X86_EFLAGS_SF | X86_EFLAGS_OF)) | ||
| 8138 | | X86_EFLAGS_ZF); | ||
| 8139 | get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; | ||
| 8140 | /* | ||
| 8141 | * We don't need to force a shadow sync because | ||
| 8142 | * VM_INSTRUCTION_ERROR is not shadowed | ||
| 8143 | */ | ||
| 8144 | return kvm_skip_emulated_instruction(vcpu); | ||
| 8145 | } | ||
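Taken together, the three helpers above implement the SDM's flag convention for emulated VMX instructions: VMsucceed clears all six arithmetic flags, VMfailInvalid sets only CF, and VMfailValid sets only ZF while writing the error number into the current VMCS. The snippet below is a hypothetical illustration (not KVM code) of how an L1 guest would decode the outcome from RFLAGS:

	/* Illustration only -- not part of KVM. */
	enum vmx_outcome { VMX_SUCCEED, VMX_FAIL_INVALID, VMX_FAIL_VALID };

	static enum vmx_outcome classify_vmx_outcome(unsigned long rflags)
	{
		if (rflags & X86_EFLAGS_CF)	/* VMfailInvalid */
			return VMX_FAIL_INVALID;
		if (rflags & X86_EFLAGS_ZF)	/* VMfailValid, error code in the VMCS */
			return VMX_FAIL_VALID;
		return VMX_SUCCEED;		/* all arithmetic flags clear */
	}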
| 8146 | |||
| 8147 | static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) | ||
| 8148 | { | ||
| 8149 | /* TODO: avoid simply resetting the guest here. */ | ||
| 8150 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); | ||
| 8151 | pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); | ||
| 8152 | } | ||
| 8153 | |||
| 8154 | static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) | ||
| 8155 | { | ||
| 8156 | struct vcpu_vmx *vmx = | ||
| 8157 | container_of(timer, struct vcpu_vmx, nested.preemption_timer); | ||
| 8158 | |||
| 8159 | vmx->nested.preemption_timer_expired = true; | ||
| 8160 | kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); | ||
| 8161 | kvm_vcpu_kick(&vmx->vcpu); | ||
| 8162 | |||
| 8163 | return HRTIMER_NORESTART; | ||
| 8164 | } | ||
| 8165 | |||
| 8166 | /* | ||
| 8167 | * Decode the memory-address operand of a VMX instruction, as recorded in the | ||
| 8168 | * VM exit caused by that instruction (when executed by a guest hypervisor). | ||
| 8169 | * On success, returns 0. When the operand is invalid, returns 1 and injects | ||
| 8170 | * #UD or #GP. | ||
| 8171 | */ | ||
| 8172 | static int get_vmx_mem_address(struct kvm_vcpu *vcpu, | ||
| 8173 | unsigned long exit_qualification, | ||
| 8174 | u32 vmx_instruction_info, bool wr, gva_t *ret) | ||
| 8175 | { | ||
| 8176 | gva_t off; | ||
| 8177 | bool exn; | ||
| 8178 | struct kvm_segment s; | ||
| 8179 | |||
| 8180 | /* | ||
| 8181 | * According to Vol. 3B, "Information for VM Exits Due to Instruction | ||
| 8182 | * Execution", on an exit, vmx_instruction_info holds most of the | ||
| 8183 | * addressing components of the operand. Only the displacement part | ||
| 8184 | * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). | ||
| 8185 | * For how an actual address is calculated from all these components, | ||
| 8186 | * refer to Vol. 1, "Operand Addressing". | ||
| 8187 | */ | ||
| 8188 | int scaling = vmx_instruction_info & 3; | ||
| 8189 | int addr_size = (vmx_instruction_info >> 7) & 7; | ||
| 8190 | bool is_reg = vmx_instruction_info & (1u << 10); | ||
| 8191 | int seg_reg = (vmx_instruction_info >> 15) & 7; | ||
| 8192 | int index_reg = (vmx_instruction_info >> 18) & 0xf; | ||
| 8193 | bool index_is_valid = !(vmx_instruction_info & (1u << 22)); | ||
| 8194 | int base_reg = (vmx_instruction_info >> 23) & 0xf; | ||
| 8195 | bool base_is_valid = !(vmx_instruction_info & (1u << 27)); | ||
| 8196 | |||
| 8197 | if (is_reg) { | ||
| 8198 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
| 8199 | return 1; | ||
| 8200 | } | ||
| 8201 | |||
| 8202 | /* Addr = segment_base + offset */ | ||
| 8203 | /* offset = base + [index * scale] + displacement */ | ||
| 8204 | off = exit_qualification; /* holds the displacement */ | ||
| 8205 | if (base_is_valid) | ||
| 8206 | off += kvm_register_read(vcpu, base_reg); | ||
| 8207 | if (index_is_valid) | ||
| 8208 | off += kvm_register_read(vcpu, index_reg)<<scaling; | ||
| 8209 | vmx_get_segment(vcpu, &s, seg_reg); | ||
| 8210 | *ret = s.base + off; | ||
| 8211 | |||
| 8212 | if (addr_size == 1) /* 32 bit */ | ||
| 8213 | *ret &= 0xffffffff; | ||
| 8214 | |||
| 8215 | /* Checks for #GP/#SS exceptions. */ | ||
| 8216 | exn = false; | ||
| 8217 | if (is_long_mode(vcpu)) { | ||
| 8218 | /* Long mode: #GP(0)/#SS(0) if the memory address is in a | ||
| 8219 | * non-canonical form. This is the only check on the memory | ||
| 8220 | * destination for long mode! | ||
| 8221 | */ | ||
| 8222 | exn = is_noncanonical_address(*ret, vcpu); | ||
| 8223 | } else if (is_protmode(vcpu)) { | ||
| 8224 | /* Protected mode: apply checks for segment validity in the | ||
| 8225 | * following order: | ||
| 8226 | * - segment type check (#GP(0) may be thrown) | ||
| 8227 | * - usability check (#GP(0)/#SS(0)) | ||
| 8228 | * - limit check (#GP(0)/#SS(0)) | ||
| 8229 | */ | ||
| 8230 | if (wr) | ||
| 8231 | /* #GP(0) if the destination operand is located in a | ||
| 8232 | * read-only data segment or any code segment. | ||
| 8233 | */ | ||
| 8234 | exn = ((s.type & 0xa) == 0 || (s.type & 8)); | ||
| 8235 | else | ||
| 8236 | /* #GP(0) if the source operand is located in an | ||
| 8237 | * execute-only code segment | ||
| 8238 | */ | ||
| 8239 | exn = ((s.type & 0xa) == 8); | ||
| 8240 | if (exn) { | ||
| 8241 | kvm_queue_exception_e(vcpu, GP_VECTOR, 0); | ||
| 8242 | return 1; | ||
| 8243 | } | ||
| 8244 | /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. | ||
| 8245 | */ | ||
| 8246 | exn = (s.unusable != 0); | ||
| 8247 | /* Protected mode: #GP(0)/#SS(0) if the memory | ||
| 8248 | * operand is outside the segment limit. | ||
| 8249 | */ | ||
| 8250 | exn = exn || (off + sizeof(u64) > s.limit); | ||
| 8251 | } | ||
| 8252 | if (exn) { | ||
| 8253 | kvm_queue_exception_e(vcpu, | ||
| 8254 | seg_reg == VCPU_SREG_SS ? | ||
| 8255 | SS_VECTOR : GP_VECTOR, | ||
| 8256 | 0); | ||
| 8257 | return 1; | ||
| 8258 | } | ||
| 8259 | |||
| 8260 | return 0; | ||
| 8261 | } | ||
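The extraction at the top of get_vmx_mem_address() decodes the VMX-instruction-information field bit by bit. The standalone sketch below (hypothetical names, not kernel code) restates the same layout in one place, which can help when reading the checks above:

	/* Bit layout mirrored from the decode in get_vmx_mem_address(). */
	struct vmx_mem_operand {
		int scaling;		/* bits 1:0 */
		int addr_size;		/* bits 9:7 (1 == 32-bit) */
		bool is_reg;		/* bit 10 */
		int seg_reg;		/* bits 17:15 */
		int index_reg;		/* bits 21:18 */
		bool index_valid;	/* bit 22 clear means valid */
		int base_reg;		/* bits 26:23 */
		bool base_valid;	/* bit 27 clear means valid */
	};

	static struct vmx_mem_operand decode_vmx_instr_info(u32 info)
	{
		return (struct vmx_mem_operand) {
			.scaling     = info & 3,
			.addr_size   = (info >> 7) & 7,
			.is_reg      = info & (1u << 10),
			.seg_reg     = (info >> 15) & 7,
			.index_reg   = (info >> 18) & 0xf,
			.index_valid = !(info & (1u << 22)),
			.base_reg    = (info >> 23) & 0xf,
			.base_valid  = !(info & (1u << 27)),
		};
	}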
| 8262 | |||
| 8263 | static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) | ||
| 8264 | { | ||
| 8265 | gva_t gva; | ||
| 8266 | struct x86_exception e; | ||
| 8267 | |||
| 8268 | if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), | ||
| 8269 | vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva)) | ||
| 8270 | return 1; | ||
| 8271 | |||
| 8272 | if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) { | ||
| 8273 | kvm_inject_page_fault(vcpu, &e); | ||
| 8274 | return 1; | ||
| 8275 | } | ||
| 8276 | |||
| 8277 | return 0; | ||
| 8278 | } | ||
| 8279 | |||
| 8280 | /* | ||
| 8281 | * Allocate a shadow VMCS and associate it with the currently loaded | ||
| 8282 | * VMCS, unless such a shadow VMCS already exists. The newly allocated | ||
| 8283 | * VMCS is also VMCLEARed, so that it is ready for use. | ||
| 8284 | */ | ||
| 8285 | static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) | ||
| 8286 | { | ||
| 8287 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 8288 | struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; | ||
| 8289 | |||
| 8290 | /* | ||
| 8291 | * We should allocate a shadow vmcs for vmcs01 only when L1 | ||
| 8292 | * executes VMXON and free it when L1 executes VMXOFF. | ||
| 8293 | * As it is invalid to execute VMXON twice, we shouldn't reach | ||
| 8294 | * here when vmcs01 already has an allocated shadow vmcs. | ||
| 8295 | */ | ||
| 8296 | WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); | ||
| 8297 | |||
| 8298 | if (!loaded_vmcs->shadow_vmcs) { | ||
| 8299 | loaded_vmcs->shadow_vmcs = alloc_vmcs(true); | ||
| 8300 | if (loaded_vmcs->shadow_vmcs) | ||
| 8301 | vmcs_clear(loaded_vmcs->shadow_vmcs); | ||
| 8302 | } | ||
| 8303 | return loaded_vmcs->shadow_vmcs; | ||
| 8304 | } | ||
| 8305 | |||
| 8306 | static int enter_vmx_operation(struct kvm_vcpu *vcpu) | ||
| 8307 | { | ||
| 8308 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 8309 | int r; | ||
| 8310 | |||
| 8311 | r = alloc_loaded_vmcs(&vmx->nested.vmcs02); | ||
| 8312 | if (r < 0) | ||
| 8313 | goto out_vmcs02; | ||
| 8314 | |||
| 8315 | vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); | ||
| 8316 | if (!vmx->nested.cached_vmcs12) | ||
| 8317 | goto out_cached_vmcs12; | ||
| 8318 | |||
| 8319 | vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); | ||
| 8320 | if (!vmx->nested.cached_shadow_vmcs12) | ||
| 8321 | goto out_cached_shadow_vmcs12; | ||
| 8322 | |||
| 8323 | if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) | ||
| 8324 | goto out_shadow_vmcs; | ||
| 8325 | |||
| 8326 | hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, | ||
| 8327 | HRTIMER_MODE_REL_PINNED); | ||
| 8328 | vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; | ||
| 8329 | |||
| 8330 | vmx->nested.vpid02 = allocate_vpid(); | ||
| 8331 | |||
| 8332 | vmx->nested.vmcs02_initialized = false; | ||
| 8333 | vmx->nested.vmxon = true; | ||
| 8334 | return 0; | ||
| 8335 | |||
| 8336 | out_shadow_vmcs: | ||
| 8337 | kfree(vmx->nested.cached_shadow_vmcs12); | ||
| 8338 | |||
| 8339 | out_cached_shadow_vmcs12: | ||
| 8340 | kfree(vmx->nested.cached_vmcs12); | ||
| 8341 | |||
| 8342 | out_cached_vmcs12: | ||
| 8343 | free_loaded_vmcs(&vmx->nested.vmcs02); | ||
| 8344 | |||
| 8345 | out_vmcs02: | ||
| 8346 | return -ENOMEM; | ||
| 8347 | } | ||
| 8348 | |||
| 8349 | /* | ||
| 8350 | * Emulate the VMXON instruction. | ||
| 8351 | * Currently, we just remember that VMX is active, and do not save or even | ||
| 8352 | * inspect the argument to VMXON (the so-called "VMXON pointer") because we | ||
| 8353 | * do not currently need to store anything in that guest-allocated memory | ||
| 8354 | * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their | ||
| 8355 | * argument is different from the VMXON pointer (which the spec says they do). | ||
| 8356 | */ | ||
| 8357 | static int handle_vmon(struct kvm_vcpu *vcpu) | ||
| 8358 | { | ||
| 8359 | int ret; | ||
| 8360 | gpa_t vmptr; | ||
| 8361 | struct page *page; | ||
| 8362 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 8363 | const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED | ||
| 8364 | | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; | ||
| 8365 | |||
| 8366 | /* | ||
| 8367 | * The Intel VMX Instruction Reference lists a bunch of bits that are | ||
| 8368 | * prerequisite to running VMXON, most notably cr4.VMXE must be set to | ||
| 8369 | * 1 (see vmx_set_cr4() for when we allow the guest to set this). | ||
| 8370 | * Otherwise, we should fail with #UD. But most faulting conditions | ||
| 8371 | * have already been checked by hardware, prior to the VM-exit for | ||
| 8372 | * VMXON. We do test guest cr4.VMXE because processor CR4 always has | ||
| 8373 | * that bit set to 1 in non-root mode. | ||
| 8374 | */ | ||
| 8375 | if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { | ||
| 8376 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
| 8377 | return 1; | ||
| 8378 | } | ||
| 8379 | |||
| 8380 | /* CPL=0 must be checked manually. */ | ||
| 8381 | if (vmx_get_cpl(vcpu)) { | ||
| 8382 | kvm_inject_gp(vcpu, 0); | ||
| 8383 | return 1; | ||
| 8384 | } | ||
| 8385 | |||
| 8386 | if (vmx->nested.vmxon) | ||
| 8387 | return nested_vmx_failValid(vcpu, | ||
| 8388 | VMXERR_VMXON_IN_VMX_ROOT_OPERATION); | ||
| 8389 | |||
| 8390 | if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) | ||
| 8391 | != VMXON_NEEDED_FEATURES) { | ||
| 8392 | kvm_inject_gp(vcpu, 0); | ||
| 8393 | return 1; | ||
| 8394 | } | ||
| 8395 | |||
| 8396 | if (nested_vmx_get_vmptr(vcpu, &vmptr)) | ||
| 8397 | return 1; | ||
| 8398 | |||
| 8399 | /* | ||
| 8400 | * SDM Vol. 3, Section 24.11.5: | ||
| 8401 | * The first 4 bytes of the VMXON region contain the supported | ||
| 8402 | * VMCS revision identifier. | ||
| 8403 | * | ||
| 8404 | * Note: IA32_VMX_BASIC[48] will never be 1 for the nested case; if it | ||
| 8405 | * were, the physical address width check would be limited to 32 bits. | ||
| 8406 | */ | ||
| 8407 | if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) | ||
| 8408 | return nested_vmx_failInvalid(vcpu); | ||
| 8409 | |||
| 8410 | page = kvm_vcpu_gpa_to_page(vcpu, vmptr); | ||
| 8411 | if (is_error_page(page)) | ||
| 8412 | return nested_vmx_failInvalid(vcpu); | ||
| 8413 | |||
| 8414 | if (*(u32 *)kmap(page) != VMCS12_REVISION) { | ||
| 8415 | kunmap(page); | ||
| 8416 | kvm_release_page_clean(page); | ||
| 8417 | return nested_vmx_failInvalid(vcpu); | ||
| 8418 | } | ||
| 8419 | kunmap(page); | ||
| 8420 | kvm_release_page_clean(page); | ||
| 8421 | |||
| 8422 | vmx->nested.vmxon_ptr = vmptr; | ||
| 8423 | ret = enter_vmx_operation(vcpu); | ||
| 8424 | if (ret) | ||
| 8425 | return ret; | ||
| 8426 | |||
| 8427 | return nested_vmx_succeed(vcpu); | ||
| 8428 | } | ||
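Condensed, handle_vmon() only accepts a VMXON pointer that is page aligned, that fits within the guest's reported physical address width, and whose first four bytes hold the supported VMCS revision identifier. A purely illustrative predicate (the names are made up) summarizing those checks:

	static bool example_vmxon_ptr_ok(u64 vmptr, int maxphyaddr,
					 u32 first_dword, u32 supported_revision)
	{
		if (vmptr & (PAGE_SIZE - 1))	/* must be page aligned */
			return false;
		if (vmptr >> maxphyaddr)	/* must fit in guest MAXPHYADDR */
			return false;
		return first_dword == supported_revision;
	}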
| 8429 | |||
| 8430 | /* | ||
| 8431 | * Intel's VMX Instruction Reference specifies a common set of prerequisites | ||
| 8432 | * for running VMX instructions (except VMXON, whose prerequisites are | ||
| 8433 | * slightly different). It also specifies what exception to inject otherwise. | ||
| 8434 | * Note that many of these exceptions have priority over VM exits, so they | ||
| 8435 | * don't have to be checked again here. | ||
| 8436 | */ | ||
| 8437 | static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) | ||
| 8438 | { | ||
| 8439 | if (!to_vmx(vcpu)->nested.vmxon) { | ||
| 8440 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
| 8441 | return 0; | ||
| 8442 | } | ||
| 8443 | |||
| 8444 | if (vmx_get_cpl(vcpu)) { | ||
| 8445 | kvm_inject_gp(vcpu, 0); | ||
| 8446 | return 0; | ||
| 8447 | } | ||
| 8448 | |||
| 8449 | return 1; | ||
| 8450 | } | ||
| 8451 | |||
| 8452 | static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) | ||
| 8453 | { | ||
| 8454 | vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); | ||
| 8455 | vmcs_write64(VMCS_LINK_POINTER, -1ull); | ||
| 8456 | } | ||
| 8457 | |||
| 8458 | static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) | ||
| 8459 | { | ||
| 8460 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 8461 | |||
| 8462 | if (!vmx->nested.hv_evmcs) | ||
| 8463 | return; | ||
| 8464 | |||
| 8465 | kunmap(vmx->nested.hv_evmcs_page); | ||
| 8466 | kvm_release_page_dirty(vmx->nested.hv_evmcs_page); | ||
| 8467 | vmx->nested.hv_evmcs_vmptr = -1ull; | ||
| 8468 | vmx->nested.hv_evmcs_page = NULL; | ||
| 8469 | vmx->nested.hv_evmcs = NULL; | ||
| 8470 | } | ||
| 8471 | |||
| 8472 | static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) | ||
| 8473 | { | ||
| 8474 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 8475 | |||
| 8476 | if (vmx->nested.current_vmptr == -1ull) | ||
| 8477 | return; | ||
| 8478 | |||
| 8479 | if (enable_shadow_vmcs) { | ||
| 8480 | /* copy to memory all shadowed fields in case | ||
| 8481 | they were modified */ | ||
| 8482 | copy_shadow_to_vmcs12(vmx); | ||
| 8483 | vmx->nested.need_vmcs12_sync = false; | ||
| 8484 | vmx_disable_shadow_vmcs(vmx); | ||
| 8485 | } | ||
| 8486 | vmx->nested.posted_intr_nv = -1; | ||
| 8487 | |||
| 8488 | /* Flush VMCS12 to guest memory */ | ||
| 8489 | kvm_vcpu_write_guest_page(vcpu, | ||
| 8490 | vmx->nested.current_vmptr >> PAGE_SHIFT, | ||
| 8491 | vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); | ||
| 8492 | |||
| 8493 | kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); | ||
| 8494 | |||
| 8495 | vmx->nested.current_vmptr = -1ull; | ||
| 8496 | } | ||
| 8497 | |||
| 8498 | /* | ||
| 8499 | * Free whatever needs to be freed from vmx->nested when L1 goes down, or | ||
| 8500 | * just stops using VMX. | ||
| 8501 | */ | ||
| 8502 | static void free_nested(struct kvm_vcpu *vcpu) | ||
| 8503 | { | ||
| 8504 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 8505 | |||
| 8506 | if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) | ||
| 8507 | return; | ||
| 8508 | |||
| 8509 | vmx->nested.vmxon = false; | ||
| 8510 | vmx->nested.smm.vmxon = false; | ||
| 8511 | free_vpid(vmx->nested.vpid02); | ||
| 8512 | vmx->nested.posted_intr_nv = -1; | ||
| 8513 | vmx->nested.current_vmptr = -1ull; | ||
| 8514 | if (enable_shadow_vmcs) { | ||
| 8515 | vmx_disable_shadow_vmcs(vmx); | ||
| 8516 | vmcs_clear(vmx->vmcs01.shadow_vmcs); | ||
| 8517 | free_vmcs(vmx->vmcs01.shadow_vmcs); | ||
| 8518 | vmx->vmcs01.shadow_vmcs = NULL; | ||
| 8519 | } | ||
| 8520 | kfree(vmx->nested.cached_vmcs12); | ||
| 8521 | kfree(vmx->nested.cached_shadow_vmcs12); | ||
| 8522 | /* Unpin physical memory we referred to in the vmcs02 */ | ||
| 8523 | if (vmx->nested.apic_access_page) { | ||
| 8524 | kvm_release_page_dirty(vmx->nested.apic_access_page); | ||
| 8525 | vmx->nested.apic_access_page = NULL; | ||
| 8526 | } | ||
| 8527 | if (vmx->nested.virtual_apic_page) { | ||
| 8528 | kvm_release_page_dirty(vmx->nested.virtual_apic_page); | ||
| 8529 | vmx->nested.virtual_apic_page = NULL; | ||
| 8530 | } | ||
| 8531 | if (vmx->nested.pi_desc_page) { | ||
| 8532 | kunmap(vmx->nested.pi_desc_page); | ||
| 8533 | kvm_release_page_dirty(vmx->nested.pi_desc_page); | ||
| 8534 | vmx->nested.pi_desc_page = NULL; | ||
| 8535 | vmx->nested.pi_desc = NULL; | ||
| 8536 | } | ||
| 8537 | |||
| 8538 | kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); | ||
| 8539 | |||
| 8540 | nested_release_evmcs(vcpu); | ||
| 8541 | |||
| 8542 | free_loaded_vmcs(&vmx->nested.vmcs02); | ||
| 8543 | } | ||
| 8544 | |||
| 8545 | /* Emulate the VMXOFF instruction */ | ||
| 8546 | static int handle_vmoff(struct kvm_vcpu *vcpu) | ||
| 8547 | { | ||
| 8548 | if (!nested_vmx_check_permission(vcpu)) | ||
| 8549 | return 1; | ||
| 8550 | free_nested(vcpu); | ||
| 8551 | return nested_vmx_succeed(vcpu); | ||
| 8552 | } | ||
| 8553 | |||
| 8554 | /* Emulate the VMCLEAR instruction */ | ||
| 8555 | static int handle_vmclear(struct kvm_vcpu *vcpu) | ||
| 8556 | { | ||
| 8557 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 8558 | u32 zero = 0; | ||
| 8559 | gpa_t vmptr; | ||
| 8560 | |||
| 8561 | if (!nested_vmx_check_permission(vcpu)) | ||
| 8562 | return 1; | ||
| 8563 | |||
| 8564 | if (nested_vmx_get_vmptr(vcpu, &vmptr)) | ||
| 8565 | return 1; | ||
| 8566 | |||
| 8567 | if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) | ||
| 8568 | return nested_vmx_failValid(vcpu, | ||
| 8569 | VMXERR_VMCLEAR_INVALID_ADDRESS); | ||
| 8570 | |||
| 8571 | if (vmptr == vmx->nested.vmxon_ptr) | ||
| 8572 | return nested_vmx_failValid(vcpu, | ||
| 8573 | VMXERR_VMCLEAR_VMXON_POINTER); | ||
| 8574 | |||
| 8575 | if (vmx->nested.hv_evmcs_page) { | ||
| 8576 | if (vmptr == vmx->nested.hv_evmcs_vmptr) | ||
| 8577 | nested_release_evmcs(vcpu); | ||
| 8578 | } else { | ||
| 8579 | if (vmptr == vmx->nested.current_vmptr) | ||
| 8580 | nested_release_vmcs12(vcpu); | ||
| 8581 | |||
| 8582 | kvm_vcpu_write_guest(vcpu, | ||
| 8583 | vmptr + offsetof(struct vmcs12, | ||
| 8584 | launch_state), | ||
| 8585 | &zero, sizeof(zero)); | ||
| 8586 | } | ||
| 8587 | |||
| 8588 | return nested_vmx_succeed(vcpu); | ||
| 8589 | } | ||
| 8590 | |||
| 8591 | static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); | ||
| 8592 | |||
| 8593 | /* Emulate the VMLAUNCH instruction */ | ||
| 8594 | static int handle_vmlaunch(struct kvm_vcpu *vcpu) | ||
| 8595 | { | ||
| 8596 | return nested_vmx_run(vcpu, true); | ||
| 8597 | } | ||
| 8598 | |||
| 8599 | /* Emulate the VMRESUME instruction */ | ||
| 8600 | static int handle_vmresume(struct kvm_vcpu *vcpu) | ||
| 8601 | { | ||
| 8602 | |||
| 8603 | return nested_vmx_run(vcpu, false); | ||
| 8604 | } | ||
| 8605 | |||
| 8606 | /* | ||
| 8607 | * Read a vmcs12 field. Since these can have varying lengths and we return | ||
| 8608 | * one type, we chose the biggest type (u64) and zero-extend the return value | ||
| 8609 | * to that size. Note that the caller, handle_vmread, might need to use only | ||
| 8610 | * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of | ||
| 8611 | * 64-bit fields are to be returned). | ||
| 8612 | */ | ||
| 8613 | static inline int vmcs12_read_any(struct vmcs12 *vmcs12, | ||
| 8614 | unsigned long field, u64 *ret) | ||
| 8615 | { | ||
| 8616 | short offset = vmcs_field_to_offset(field); | ||
| 8617 | char *p; | ||
| 8618 | |||
| 8619 | if (offset < 0) | ||
| 8620 | return offset; | ||
| 8621 | |||
| 8622 | p = (char *)vmcs12 + offset; | ||
| 8623 | |||
| 8624 | switch (vmcs_field_width(field)) { | ||
| 8625 | case VMCS_FIELD_WIDTH_NATURAL_WIDTH: | ||
| 8626 | *ret = *((natural_width *)p); | ||
| 8627 | return 0; | ||
| 8628 | case VMCS_FIELD_WIDTH_U16: | ||
| 8629 | *ret = *((u16 *)p); | ||
| 8630 | return 0; | ||
| 8631 | case VMCS_FIELD_WIDTH_U32: | ||
| 8632 | *ret = *((u32 *)p); | ||
| 8633 | return 0; | ||
| 8634 | case VMCS_FIELD_WIDTH_U64: | ||
| 8635 | *ret = *((u64 *)p); | ||
| 8636 | return 0; | ||
| 8637 | default: | ||
| 8638 | WARN_ON(1); | ||
| 8639 | return -ENOENT; | ||
| 8640 | } | ||
| 8641 | } | ||
| 8642 | |||
| 8643 | |||
| 8644 | static inline int vmcs12_write_any(struct vmcs12 *vmcs12, | ||
| 8645 | unsigned long field, u64 field_value) { | ||
| 8646 | short offset = vmcs_field_to_offset(field); | ||
| 8647 | char *p; | ||
| 8648 | if (offset < 0) | ||
| 8649 | return offset; | ||
| 8650 | p = (char *)vmcs12 + offset; | ||
| 8651 | switch (vmcs_field_width(field)) { | ||
| 8652 | case VMCS_FIELD_WIDTH_U16: | ||
| 8653 | *(u16 *)p = field_value; | ||
| 8654 | return 0; | ||
| 8655 | case VMCS_FIELD_WIDTH_U32: | ||
| 8656 | *(u32 *)p = field_value; | ||
| 8657 | return 0; | ||
| 8658 | case VMCS_FIELD_WIDTH_U64: | ||
| 8659 | *(u64 *)p = field_value; | ||
| 8660 | return 0; | ||
| 8661 | case VMCS_FIELD_WIDTH_NATURAL_WIDTH: | ||
| 8662 | *(natural_width *)p = field_value; | ||
| 8663 | return 0; | ||
| 8664 | default: | ||
| 8665 | WARN_ON(1); | ||
| 8666 | return -ENOENT; | ||
| 8667 | } | ||
| 8668 | |||
| 8669 | } | ||
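vmcs12_read_any() and vmcs12_write_any() give width-aware access to the in-memory vmcs12 image: callers always deal in u64, and the switch on vmcs_field_width() truncates or zero-extends as needed. A minimal, hypothetical round-trip through the accessors (GUEST_RIP is used only as an example field encoding):

	static void example_vmcs12_roundtrip(struct vmcs12 *vmcs12)
	{
		u64 val;

		if (vmcs12_read_any(vmcs12, GUEST_RIP, &val) < 0)
			return;		/* unknown or unsupported field */

		/* Fields narrower than u64 are truncated on the way back in. */
		vmcs12_write_any(vmcs12, GUEST_RIP, val + 2);
	}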
| 8670 | |||
| 8671 | static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) | ||
| 8672 | { | ||
| 8673 | struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; | ||
| 8674 | struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; | ||
| 8675 | |||
| 8676 | /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ | ||
| 8677 | vmcs12->tpr_threshold = evmcs->tpr_threshold; | ||
| 8678 | vmcs12->guest_rip = evmcs->guest_rip; | ||
| 8679 | |||
| 8680 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 8681 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { | ||
| 8682 | vmcs12->guest_rsp = evmcs->guest_rsp; | ||
| 8683 | vmcs12->guest_rflags = evmcs->guest_rflags; | ||
| 8684 | vmcs12->guest_interruptibility_info = | ||
| 8685 | evmcs->guest_interruptibility_info; | ||
| 8686 | } | ||
| 8687 | |||
| 8688 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 8689 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { | ||
| 8690 | vmcs12->cpu_based_vm_exec_control = | ||
| 8691 | evmcs->cpu_based_vm_exec_control; | ||
| 8692 | } | ||
| 8693 | |||
| 8694 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 8695 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { | ||
| 8696 | vmcs12->exception_bitmap = evmcs->exception_bitmap; | ||
| 8697 | } | ||
| 8698 | |||
| 8699 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 8700 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { | ||
| 8701 | vmcs12->vm_entry_controls = evmcs->vm_entry_controls; | ||
| 8702 | } | ||
| 8703 | |||
| 8704 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 8705 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { | ||
| 8706 | vmcs12->vm_entry_intr_info_field = | ||
| 8707 | evmcs->vm_entry_intr_info_field; | ||
| 8708 | vmcs12->vm_entry_exception_error_code = | ||
| 8709 | evmcs->vm_entry_exception_error_code; | ||
| 8710 | vmcs12->vm_entry_instruction_len = | ||
| 8711 | evmcs->vm_entry_instruction_len; | ||
| 8712 | } | ||
| 8713 | |||
| 8714 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 8715 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { | ||
| 8716 | vmcs12->host_ia32_pat = evmcs->host_ia32_pat; | ||
| 8717 | vmcs12->host_ia32_efer = evmcs->host_ia32_efer; | ||
| 8718 | vmcs12->host_cr0 = evmcs->host_cr0; | ||
| 8719 | vmcs12->host_cr3 = evmcs->host_cr3; | ||
| 8720 | vmcs12->host_cr4 = evmcs->host_cr4; | ||
| 8721 | vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; | ||
| 8722 | vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip; | ||
| 8723 | vmcs12->host_rip = evmcs->host_rip; | ||
| 8724 | vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; | ||
| 8725 | vmcs12->host_es_selector = evmcs->host_es_selector; | ||
| 8726 | vmcs12->host_cs_selector = evmcs->host_cs_selector; | ||
| 8727 | vmcs12->host_ss_selector = evmcs->host_ss_selector; | ||
| 8728 | vmcs12->host_ds_selector = evmcs->host_ds_selector; | ||
| 8729 | vmcs12->host_fs_selector = evmcs->host_fs_selector; | ||
| 8730 | vmcs12->host_gs_selector = evmcs->host_gs_selector; | ||
| 8731 | vmcs12->host_tr_selector = evmcs->host_tr_selector; | ||
| 8732 | } | ||
| 8733 | |||
| 8734 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 8735 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { | ||
| 8736 | vmcs12->pin_based_vm_exec_control = | ||
| 8737 | evmcs->pin_based_vm_exec_control; | ||
| 8738 | vmcs12->vm_exit_controls = evmcs->vm_exit_controls; | ||
| 8739 | vmcs12->secondary_vm_exec_control = | ||
| 8740 | evmcs->secondary_vm_exec_control; | ||
| 8741 | } | ||
| 8742 | |||
| 8743 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 8744 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { | ||
| 8745 | vmcs12->io_bitmap_a = evmcs->io_bitmap_a; | ||
| 8746 | vmcs12->io_bitmap_b = evmcs->io_bitmap_b; | ||
| 8747 | } | ||
| 8748 | |||
| 8749 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 8750 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { | ||
| 8751 | vmcs12->msr_bitmap = evmcs->msr_bitmap; | ||
| 8752 | } | ||
| 8753 | |||
| 8754 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 8755 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { | ||
| 8756 | vmcs12->guest_es_base = evmcs->guest_es_base; | ||
| 8757 | vmcs12->guest_cs_base = evmcs->guest_cs_base; | ||
| 8758 | vmcs12->guest_ss_base = evmcs->guest_ss_base; | ||
| 8759 | vmcs12->guest_ds_base = evmcs->guest_ds_base; | ||
| 8760 | vmcs12->guest_fs_base = evmcs->guest_fs_base; | ||
| 8761 | vmcs12->guest_gs_base = evmcs->guest_gs_base; | ||
| 8762 | vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; | ||
| 8763 | vmcs12->guest_tr_base = evmcs->guest_tr_base; | ||
| 8764 | vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; | ||
| 8765 | vmcs12->guest_idtr_base = evmcs->guest_idtr_base; | ||
| 8766 | vmcs12->guest_es_limit = evmcs->guest_es_limit; | ||
| 8767 | vmcs12->guest_cs_limit = evmcs->guest_cs_limit; | ||
| 8768 | vmcs12->guest_ss_limit = evmcs->guest_ss_limit; | ||
| 8769 | vmcs12->guest_ds_limit = evmcs->guest_ds_limit; | ||
| 8770 | vmcs12->guest_fs_limit = evmcs->guest_fs_limit; | ||
| 8771 | vmcs12->guest_gs_limit = evmcs->guest_gs_limit; | ||
| 8772 | vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit; | ||
| 8773 | vmcs12->guest_tr_limit = evmcs->guest_tr_limit; | ||
| 8774 | vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; | ||
| 8775 | vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit; | ||
| 8776 | vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; | ||
| 8777 | vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; | ||
| 8778 | vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; | ||
| 8779 | vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; | ||
| 8780 | vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; | ||
| 8781 | vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; | ||
| 8782 | vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; | ||
| 8783 | vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; | ||
| 8784 | vmcs12->guest_es_selector = evmcs->guest_es_selector; | ||
| 8785 | vmcs12->guest_cs_selector = evmcs->guest_cs_selector; | ||
| 8786 | vmcs12->guest_ss_selector = evmcs->guest_ss_selector; | ||
| 8787 | vmcs12->guest_ds_selector = evmcs->guest_ds_selector; | ||
| 8788 | vmcs12->guest_fs_selector = evmcs->guest_fs_selector; | ||
| 8789 | vmcs12->guest_gs_selector = evmcs->guest_gs_selector; | ||
| 8790 | vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; | ||
| 8791 | vmcs12->guest_tr_selector = evmcs->guest_tr_selector; | ||
| 8792 | } | ||
| 8793 | |||
| 8794 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 8795 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { | ||
| 8796 | vmcs12->tsc_offset = evmcs->tsc_offset; | ||
| 8797 | vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; | ||
| 8798 | vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; | ||
| 8799 | } | ||
| 8800 | |||
| 8801 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 8802 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { | ||
| 8803 | vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; | ||
| 8804 | vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; | ||
| 8805 | vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; | ||
| 8806 | vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; | ||
| 8807 | vmcs12->guest_cr0 = evmcs->guest_cr0; | ||
| 8808 | vmcs12->guest_cr3 = evmcs->guest_cr3; | ||
| 8809 | vmcs12->guest_cr4 = evmcs->guest_cr4; | ||
| 8810 | vmcs12->guest_dr7 = evmcs->guest_dr7; | ||
| 8811 | } | ||
| 8812 | |||
| 8813 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 8814 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { | ||
| 8815 | vmcs12->host_fs_base = evmcs->host_fs_base; | ||
| 8816 | vmcs12->host_gs_base = evmcs->host_gs_base; | ||
| 8817 | vmcs12->host_tr_base = evmcs->host_tr_base; | ||
| 8818 | vmcs12->host_gdtr_base = evmcs->host_gdtr_base; | ||
| 8819 | vmcs12->host_idtr_base = evmcs->host_idtr_base; | ||
| 8820 | vmcs12->host_rsp = evmcs->host_rsp; | ||
| 8821 | } | ||
| 8822 | |||
| 8823 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 8824 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { | ||
| 8825 | vmcs12->ept_pointer = evmcs->ept_pointer; | ||
| 8826 | vmcs12->virtual_processor_id = evmcs->virtual_processor_id; | ||
| 8827 | } | ||
| 8828 | |||
| 8829 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 8830 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { | ||
| 8831 | vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; | ||
| 8832 | vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; | ||
| 8833 | vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; | ||
| 8834 | vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; | ||
| 8835 | vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; | ||
| 8836 | vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; | ||
| 8837 | vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; | ||
| 8838 | vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; | ||
| 8839 | vmcs12->guest_pending_dbg_exceptions = | ||
| 8840 | evmcs->guest_pending_dbg_exceptions; | ||
| 8841 | vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; | ||
| 8842 | vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; | ||
| 8843 | vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; | ||
| 8844 | vmcs12->guest_activity_state = evmcs->guest_activity_state; | ||
| 8845 | vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; | ||
| 8846 | } | ||
| 8847 | |||
| 8848 | /* | ||
| 8849 | * Not used? | ||
| 8850 | * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; | ||
| 8851 | * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; | ||
| 8852 | * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; | ||
| 8853 | * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0; | ||
| 8854 | * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1; | ||
| 8855 | * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2; | ||
| 8856 | * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3; | ||
| 8857 | * vmcs12->page_fault_error_code_mask = | ||
| 8858 | * evmcs->page_fault_error_code_mask; | ||
| 8859 | * vmcs12->page_fault_error_code_match = | ||
| 8860 | * evmcs->page_fault_error_code_match; | ||
| 8861 | * vmcs12->cr3_target_count = evmcs->cr3_target_count; | ||
| 8862 | * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; | ||
| 8863 | * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; | ||
| 8864 | * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; | ||
| 8865 | */ | ||
| 8866 | |||
| 8867 | /* | ||
| 8868 | * Read only fields: | ||
| 8869 | * vmcs12->guest_physical_address = evmcs->guest_physical_address; | ||
| 8870 | * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; | ||
| 8871 | * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; | ||
| 8872 | * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; | ||
| 8873 | * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; | ||
| 8874 | * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; | ||
| 8875 | * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; | ||
| 8876 | * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; | ||
| 8877 | * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; | ||
| 8878 | * vmcs12->exit_qualification = evmcs->exit_qualification; | ||
| 8879 | * vmcs12->guest_linear_address = evmcs->guest_linear_address; | ||
| 8880 | * | ||
| 8881 | * Not present in struct vmcs12: | ||
| 8882 | * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; | ||
| 8883 | * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; | ||
| 8884 | * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; | ||
| 8885 | * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; | ||
| 8886 | */ | ||
| 8887 | |||
| 8888 | return 0; | ||
| 8889 | } | ||
| 8890 | |||
| 8891 | static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) | ||
| 8892 | { | ||
| 8893 | struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; | ||
| 8894 | struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; | ||
| 8895 | |||
| 8896 | /* | ||
| 8897 | * Should not be changed by KVM: | ||
| 8898 | * | ||
| 8899 | * evmcs->host_es_selector = vmcs12->host_es_selector; | ||
| 8900 | * evmcs->host_cs_selector = vmcs12->host_cs_selector; | ||
| 8901 | * evmcs->host_ss_selector = vmcs12->host_ss_selector; | ||
| 8902 | * evmcs->host_ds_selector = vmcs12->host_ds_selector; | ||
| 8903 | * evmcs->host_fs_selector = vmcs12->host_fs_selector; | ||
| 8904 | * evmcs->host_gs_selector = vmcs12->host_gs_selector; | ||
| 8905 | * evmcs->host_tr_selector = vmcs12->host_tr_selector; | ||
| 8906 | * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; | ||
| 8907 | * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; | ||
| 8908 | * evmcs->host_cr0 = vmcs12->host_cr0; | ||
| 8909 | * evmcs->host_cr3 = vmcs12->host_cr3; | ||
| 8910 | * evmcs->host_cr4 = vmcs12->host_cr4; | ||
| 8911 | * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; | ||
| 8912 | * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; | ||
| 8913 | * evmcs->host_rip = vmcs12->host_rip; | ||
| 8914 | * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; | ||
| 8915 | * evmcs->host_fs_base = vmcs12->host_fs_base; | ||
| 8916 | * evmcs->host_gs_base = vmcs12->host_gs_base; | ||
| 8917 | * evmcs->host_tr_base = vmcs12->host_tr_base; | ||
| 8918 | * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; | ||
| 8919 | * evmcs->host_idtr_base = vmcs12->host_idtr_base; | ||
| 8920 | * evmcs->host_rsp = vmcs12->host_rsp; | ||
| 8921 | * sync_vmcs12() doesn't read these: | ||
| 8922 | * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; | ||
| 8923 | * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; | ||
| 8924 | * evmcs->msr_bitmap = vmcs12->msr_bitmap; | ||
| 8925 | * evmcs->ept_pointer = vmcs12->ept_pointer; | ||
| 8926 | * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; | ||
| 8927 | * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; | ||
| 8928 | * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; | ||
| 8929 | * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; | ||
| 8930 | * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0; | ||
| 8931 | * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1; | ||
| 8932 | * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2; | ||
| 8933 | * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3; | ||
| 8934 | * evmcs->tpr_threshold = vmcs12->tpr_threshold; | ||
| 8935 | * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; | ||
| 8936 | * evmcs->exception_bitmap = vmcs12->exception_bitmap; | ||
| 8937 | * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; | ||
| 8938 | * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; | ||
| 8939 | * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; | ||
| 8940 | * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; | ||
| 8941 | * evmcs->page_fault_error_code_mask = | ||
| 8942 | * vmcs12->page_fault_error_code_mask; | ||
| 8943 | * evmcs->page_fault_error_code_match = | ||
| 8944 | * vmcs12->page_fault_error_code_match; | ||
| 8945 | * evmcs->cr3_target_count = vmcs12->cr3_target_count; | ||
| 8946 | * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; | ||
| 8947 | * evmcs->tsc_offset = vmcs12->tsc_offset; | ||
| 8948 | * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; | ||
| 8949 | * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; | ||
| 8950 | * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; | ||
| 8951 | * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; | ||
| 8952 | * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; | ||
| 8953 | * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; | ||
| 8954 | * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; | ||
| 8955 | * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; | ||
| 8956 | * | ||
| 8957 | * Not present in struct vmcs12: | ||
| 8958 | * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; | ||
| 8959 | * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; | ||
| 8960 | * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; | ||
| 8961 | * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; | ||
| 8962 | */ | ||
| 8963 | |||
| 8964 | evmcs->guest_es_selector = vmcs12->guest_es_selector; | ||
| 8965 | evmcs->guest_cs_selector = vmcs12->guest_cs_selector; | ||
| 8966 | evmcs->guest_ss_selector = vmcs12->guest_ss_selector; | ||
| 8967 | evmcs->guest_ds_selector = vmcs12->guest_ds_selector; | ||
| 8968 | evmcs->guest_fs_selector = vmcs12->guest_fs_selector; | ||
| 8969 | evmcs->guest_gs_selector = vmcs12->guest_gs_selector; | ||
| 8970 | evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; | ||
| 8971 | evmcs->guest_tr_selector = vmcs12->guest_tr_selector; | ||
| 8972 | |||
| 8973 | evmcs->guest_es_limit = vmcs12->guest_es_limit; | ||
| 8974 | evmcs->guest_cs_limit = vmcs12->guest_cs_limit; | ||
| 8975 | evmcs->guest_ss_limit = vmcs12->guest_ss_limit; | ||
| 8976 | evmcs->guest_ds_limit = vmcs12->guest_ds_limit; | ||
| 8977 | evmcs->guest_fs_limit = vmcs12->guest_fs_limit; | ||
| 8978 | evmcs->guest_gs_limit = vmcs12->guest_gs_limit; | ||
| 8979 | evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; | ||
| 8980 | evmcs->guest_tr_limit = vmcs12->guest_tr_limit; | ||
| 8981 | evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; | ||
| 8982 | evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; | ||
| 8983 | |||
| 8984 | evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; | ||
| 8985 | evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; | ||
| 8986 | evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; | ||
| 8987 | evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; | ||
| 8988 | evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; | ||
| 8989 | evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; | ||
| 8990 | evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; | ||
| 8991 | evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; | ||
| 8992 | |||
| 8993 | evmcs->guest_es_base = vmcs12->guest_es_base; | ||
| 8994 | evmcs->guest_cs_base = vmcs12->guest_cs_base; | ||
| 8995 | evmcs->guest_ss_base = vmcs12->guest_ss_base; | ||
| 8996 | evmcs->guest_ds_base = vmcs12->guest_ds_base; | ||
| 8997 | evmcs->guest_fs_base = vmcs12->guest_fs_base; | ||
| 8998 | evmcs->guest_gs_base = vmcs12->guest_gs_base; | ||
| 8999 | evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; | ||
| 9000 | evmcs->guest_tr_base = vmcs12->guest_tr_base; | ||
| 9001 | evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; | ||
| 9002 | evmcs->guest_idtr_base = vmcs12->guest_idtr_base; | ||
| 9003 | |||
| 9004 | evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; | ||
| 9005 | evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; | ||
| 9006 | |||
| 9007 | evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; | ||
| 9008 | evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; | ||
| 9009 | evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; | ||
| 9010 | evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; | ||
| 9011 | |||
| 9012 | evmcs->guest_pending_dbg_exceptions = | ||
| 9013 | vmcs12->guest_pending_dbg_exceptions; | ||
| 9014 | evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; | ||
| 9015 | evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; | ||
| 9016 | |||
| 9017 | evmcs->guest_activity_state = vmcs12->guest_activity_state; | ||
| 9018 | evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; | ||
| 9019 | |||
| 9020 | evmcs->guest_cr0 = vmcs12->guest_cr0; | ||
| 9021 | evmcs->guest_cr3 = vmcs12->guest_cr3; | ||
| 9022 | evmcs->guest_cr4 = vmcs12->guest_cr4; | ||
| 9023 | evmcs->guest_dr7 = vmcs12->guest_dr7; | ||
| 9024 | |||
| 9025 | evmcs->guest_physical_address = vmcs12->guest_physical_address; | ||
| 9026 | |||
| 9027 | evmcs->vm_instruction_error = vmcs12->vm_instruction_error; | ||
| 9028 | evmcs->vm_exit_reason = vmcs12->vm_exit_reason; | ||
| 9029 | evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; | ||
| 9030 | evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; | ||
| 9031 | evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; | ||
| 9032 | evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; | ||
| 9033 | evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; | ||
| 9034 | evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; | ||
| 9035 | |||
| 9036 | evmcs->exit_qualification = vmcs12->exit_qualification; | ||
| 9037 | |||
| 9038 | evmcs->guest_linear_address = vmcs12->guest_linear_address; | ||
| 9039 | evmcs->guest_rsp = vmcs12->guest_rsp; | ||
| 9040 | evmcs->guest_rflags = vmcs12->guest_rflags; | ||
| 9041 | |||
| 9042 | evmcs->guest_interruptibility_info = | ||
| 9043 | vmcs12->guest_interruptibility_info; | ||
| 9044 | evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; | ||
| 9045 | evmcs->vm_entry_controls = vmcs12->vm_entry_controls; | ||
| 9046 | evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; | ||
| 9047 | evmcs->vm_entry_exception_error_code = | ||
| 9048 | vmcs12->vm_entry_exception_error_code; | ||
| 9049 | evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; | ||
| 9050 | |||
| 9051 | evmcs->guest_rip = vmcs12->guest_rip; | ||
| 9052 | |||
| 9053 | evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; | ||
| 9054 | |||
| 9055 | return 0; | ||
| 9056 | } | ||
| 9057 | |||
| 9058 | /* | ||
| 9059 | * Copy the writable VMCS shadow fields back to the VMCS12, in case | ||
| 9060 | * they have been modified by the L1 guest. Note that the "read-only" | ||
| 9061 | * VM-exit information fields are actually writable if the vCPU is | ||
| 9062 | * configured to support "VMWRITE to any supported field in the VMCS." | ||
| 9063 | */ | ||
| 9064 | static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) | ||
| 9065 | { | ||
| 9066 | const u16 *fields[] = { | ||
| 9067 | shadow_read_write_fields, | ||
| 9068 | shadow_read_only_fields | ||
| 9069 | }; | ||
| 9070 | const int max_fields[] = { | ||
| 9071 | max_shadow_read_write_fields, | ||
| 9072 | max_shadow_read_only_fields | ||
| 9073 | }; | ||
| 9074 | int i, q; | ||
| 9075 | unsigned long field; | ||
| 9076 | u64 field_value; | ||
| 9077 | struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; | ||
| 9078 | |||
| 9079 | preempt_disable(); | ||
| 9080 | |||
| 9081 | vmcs_load(shadow_vmcs); | ||
| 9082 | |||
| 9083 | for (q = 0; q < ARRAY_SIZE(fields); q++) { | ||
| 9084 | for (i = 0; i < max_fields[q]; i++) { | ||
| 9085 | field = fields[q][i]; | ||
| 9086 | field_value = __vmcs_readl(field); | ||
| 9087 | vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value); | ||
| 9088 | } | ||
| 9089 | /* | ||
| 9090 | * Skip the VM-exit information fields if they are read-only. | ||
| 9091 | */ | ||
| 9092 | if (!nested_cpu_has_vmwrite_any_field(&vmx->vcpu)) | ||
| 9093 | break; | ||
| 9094 | } | ||
| 9095 | |||
| 9096 | vmcs_clear(shadow_vmcs); | ||
| 9097 | vmcs_load(vmx->loaded_vmcs->vmcs); | ||
| 9098 | |||
| 9099 | preempt_enable(); | ||
| 9100 | } | ||
| 9101 | |||
| 9102 | static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) | ||
| 9103 | { | ||
| 9104 | const u16 *fields[] = { | ||
| 9105 | shadow_read_write_fields, | ||
| 9106 | shadow_read_only_fields | ||
| 9107 | }; | ||
| 9108 | const int max_fields[] = { | ||
| 9109 | max_shadow_read_write_fields, | ||
| 9110 | max_shadow_read_only_fields | ||
| 9111 | }; | ||
| 9112 | int i, q; | ||
| 9113 | unsigned long field; | ||
| 9114 | u64 field_value = 0; | ||
| 9115 | struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; | ||
| 9116 | |||
| 9117 | vmcs_load(shadow_vmcs); | ||
| 9118 | |||
| 9119 | for (q = 0; q < ARRAY_SIZE(fields); q++) { | ||
| 9120 | for (i = 0; i < max_fields[q]; i++) { | ||
| 9121 | field = fields[q][i]; | ||
| 9122 | vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value); | ||
| 9123 | __vmcs_writel(field, field_value); | ||
| 9124 | } | ||
| 9125 | } | ||
| 9126 | |||
| 9127 | vmcs_clear(shadow_vmcs); | ||
| 9128 | vmcs_load(vmx->loaded_vmcs->vmcs); | ||
| 9129 | } | ||
| 9130 | |||
| 9131 | static int handle_vmread(struct kvm_vcpu *vcpu) | ||
| 9132 | { | ||
| 9133 | unsigned long field; | ||
| 9134 | u64 field_value; | ||
| 9135 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 9136 | u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
| 9137 | gva_t gva = 0; | ||
| 9138 | struct vmcs12 *vmcs12; | ||
| 9139 | |||
| 9140 | if (!nested_vmx_check_permission(vcpu)) | ||
| 9141 | return 1; | ||
| 9142 | |||
| 9143 | if (to_vmx(vcpu)->nested.current_vmptr == -1ull) | ||
| 9144 | return nested_vmx_failInvalid(vcpu); | ||
| 9145 | |||
| 9146 | if (!is_guest_mode(vcpu)) | ||
| 9147 | vmcs12 = get_vmcs12(vcpu); | ||
| 9148 | else { | ||
| 9149 | /* | ||
| 9150 | * When vmcs12->vmcs_link_pointer is -1ull, any VMREAD | ||
| 9151 | * of a shadowed field sets the ALU flags for VMfailInvalid. | ||
| 9152 | */ | ||
| 9153 | if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) | ||
| 9154 | return nested_vmx_failInvalid(vcpu); | ||
| 9155 | vmcs12 = get_shadow_vmcs12(vcpu); | ||
| 9156 | } | ||
| 9157 | |||
| 9158 | /* Decode instruction info and find the field to read */ | ||
| 9159 | field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); | ||
| 9160 | /* Read the field, zero-extended to a u64 field_value */ | ||
| 9161 | if (vmcs12_read_any(vmcs12, field, &field_value) < 0) | ||
| 9162 | return nested_vmx_failValid(vcpu, | ||
| 9163 | VMXERR_UNSUPPORTED_VMCS_COMPONENT); | ||
| 9164 | |||
| 9165 | /* | ||
| 9166 | * Now copy part of this value to register or memory, as requested. | ||
| 9167 | * Note that the number of bits actually copied is 32 or 64 depending | ||
| 9168 | * on the guest's mode (32 or 64 bit), not on the given field's length. | ||
| 9169 | */ | ||
| 9170 | if (vmx_instruction_info & (1u << 10)) { | ||
| 9171 | kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), | ||
| 9172 | field_value); | ||
| 9173 | } else { | ||
| 9174 | if (get_vmx_mem_address(vcpu, exit_qualification, | ||
| 9175 | vmx_instruction_info, true, &gva)) | ||
| 9176 | return 1; | ||
| 9177 | /* _system ok, nested_vmx_check_permission has verified cpl=0 */ | ||
| 9178 | kvm_write_guest_virt_system(vcpu, gva, &field_value, | ||
| 9179 | (is_long_mode(vcpu) ? 8 : 4), NULL); | ||
| 9180 | } | ||
| 9181 | |||
| 9182 | return nested_vmx_succeed(vcpu); | ||
| 9183 | } | ||
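As the comment in handle_vmread() notes, the number of bits that actually reaches L1 depends on the guest's mode, not on the field's declared width, so a 64-bit field read by a 32-bit guest is silently truncated. Illustration only:

	static u64 example_vmread_visible_bits(u64 field_value, bool long_mode)
	{
		return long_mode ? field_value : (u64)(u32)field_value;
	}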
| 9184 | |||
| 9185 | |||
| 9186 | static int handle_vmwrite(struct kvm_vcpu *vcpu) | ||
| 9187 | { | ||
| 9188 | unsigned long field; | ||
| 9189 | gva_t gva; | ||
| 9190 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 9191 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 9192 | u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
| 9193 | |||
| 9194 | /* The value to write might be 32 or 64 bits, depending on L1's long | ||
| 9195 | * mode, and eventually we need to write that into a field of several | ||
| 9196 | * possible lengths. The code below first zero-extends the value to 64 | ||
| 9197 | * bits (field_value), and then copies only the appropriate number of | ||
| 9198 | * bits into the vmcs12 field. | ||
| 9199 | */ | ||
| 9200 | u64 field_value = 0; | ||
| 9201 | struct x86_exception e; | ||
| 9202 | struct vmcs12 *vmcs12; | ||
| 9203 | |||
| 9204 | if (!nested_vmx_check_permission(vcpu)) | ||
| 9205 | return 1; | ||
| 9206 | |||
| 9207 | if (vmx->nested.current_vmptr == -1ull) | ||
| 9208 | return nested_vmx_failInvalid(vcpu); | ||
| 9209 | |||
| 9210 | if (vmx_instruction_info & (1u << 10)) | ||
| 9211 | field_value = kvm_register_readl(vcpu, | ||
| 9212 | (((vmx_instruction_info) >> 3) & 0xf)); | ||
| 9213 | else { | ||
| 9214 | if (get_vmx_mem_address(vcpu, exit_qualification, | ||
| 9215 | vmx_instruction_info, false, &gva)) | ||
| 9216 | return 1; | ||
| 9217 | if (kvm_read_guest_virt(vcpu, gva, &field_value, | ||
| 9218 | (is_64_bit_mode(vcpu) ? 8 : 4), &e)) { | ||
| 9219 | kvm_inject_page_fault(vcpu, &e); | ||
| 9220 | return 1; | ||
| 9221 | } | ||
| 9222 | } | ||
| 9223 | |||
| 9224 | |||
| 9225 | field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); | ||
| 9226 | /* | ||
| 9227 | * If the vCPU supports "VMWRITE to any supported field in the | ||
| 9228 | * VMCS," then the "read-only" fields are actually read/write. | ||
| 9229 | */ | ||
| 9230 | if (vmcs_field_readonly(field) && | ||
| 9231 | !nested_cpu_has_vmwrite_any_field(vcpu)) | ||
| 9232 | return nested_vmx_failValid(vcpu, | ||
| 9233 | VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); | ||
| 9234 | |||
| 9235 | if (!is_guest_mode(vcpu)) | ||
| 9236 | vmcs12 = get_vmcs12(vcpu); | ||
| 9237 | else { | ||
| 9238 | /* | ||
| 9239 | * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE | ||
| 9240 | * to a shadowed field sets the ALU flags for VMfailInvalid. | ||
| 9241 | */ | ||
| 9242 | if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) | ||
| 9243 | return nested_vmx_failInvalid(vcpu); | ||
| 9244 | vmcs12 = get_shadow_vmcs12(vcpu); | ||
| 9245 | } | ||
| 9246 | |||
| 9247 | if (vmcs12_write_any(vmcs12, field, field_value) < 0) | ||
| 9248 | return nested_vmx_failValid(vcpu, | ||
| 9249 | VMXERR_UNSUPPORTED_VMCS_COMPONENT); | ||
| 9250 | |||
| 9251 | /* | ||
| 9252 | * Do not track vmcs12 dirty state while in guest mode, | ||
| 9253 | * as we actually dirty the shadow vmcs12 rather than vmcs12. | ||
| 9254 | */ | ||
| 9255 | if (!is_guest_mode(vcpu)) { | ||
| 9256 | switch (field) { | ||
| 9257 | #define SHADOW_FIELD_RW(x) case x: | ||
| 9258 | #include "vmx_shadow_fields.h" | ||
| 9259 | /* | ||
| 9260 | * The fields that can be updated by L1 without a vmexit are | ||
| 9261 | * always updated in the vmcs02; the others go down the slow | ||
| 9262 | * path of prepare_vmcs02. | ||
| 9263 | */ | ||
| 9264 | break; | ||
| 9265 | default: | ||
| 9266 | vmx->nested.dirty_vmcs12 = true; | ||
| 9267 | break; | ||
| 9268 | } | ||
| 9269 | } | ||
| 9270 | |||
| 9271 | return nested_vmx_succeed(vcpu); | ||
| 9272 | } | ||
| 9273 | |||
| 9274 | static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) | ||
| 9275 | { | ||
| 9276 | vmx->nested.current_vmptr = vmptr; | ||
| 9277 | if (enable_shadow_vmcs) { | ||
| 9278 | vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, | ||
| 9279 | SECONDARY_EXEC_SHADOW_VMCS); | ||
| 9280 | vmcs_write64(VMCS_LINK_POINTER, | ||
| 9281 | __pa(vmx->vmcs01.shadow_vmcs)); | ||
| 9282 | vmx->nested.need_vmcs12_sync = true; | ||
| 9283 | } | ||
| 9284 | vmx->nested.dirty_vmcs12 = true; | ||
| 9285 | } | ||
| 9286 | |||
| 9287 | /* Emulate the VMPTRLD instruction */ | ||
| 9288 | static int handle_vmptrld(struct kvm_vcpu *vcpu) | ||
| 9289 | { | ||
| 9290 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 9291 | gpa_t vmptr; | ||
| 9292 | |||
| 9293 | if (!nested_vmx_check_permission(vcpu)) | ||
| 9294 | return 1; | ||
| 9295 | |||
| 9296 | if (nested_vmx_get_vmptr(vcpu, &vmptr)) | ||
| 9297 | return 1; | ||
| 9298 | |||
| 9299 | if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) | ||
| 9300 | return nested_vmx_failValid(vcpu, | ||
| 9301 | VMXERR_VMPTRLD_INVALID_ADDRESS); | ||
| 9302 | |||
| 9303 | if (vmptr == vmx->nested.vmxon_ptr) | ||
| 9304 | return nested_vmx_failValid(vcpu, | ||
| 9305 | VMXERR_VMPTRLD_VMXON_POINTER); | ||
| 9306 | |||
| 9307 | /* Forbid normal VMPTRLD if Enlightened version was used */ | ||
| 9308 | if (vmx->nested.hv_evmcs) | ||
| 9309 | return 1; | ||
| 9310 | |||
| 9311 | if (vmx->nested.current_vmptr != vmptr) { | ||
| 9312 | struct vmcs12 *new_vmcs12; | ||
| 9313 | struct page *page; | ||
| 9314 | page = kvm_vcpu_gpa_to_page(vcpu, vmptr); | ||
| 9315 | if (is_error_page(page)) | ||
| 9316 | return nested_vmx_failInvalid(vcpu); | ||
| 9317 | |||
| 9318 | new_vmcs12 = kmap(page); | ||
| 9319 | if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || | ||
| 9320 | (new_vmcs12->hdr.shadow_vmcs && | ||
| 9321 | !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { | ||
| 9322 | kunmap(page); | ||
| 9323 | kvm_release_page_clean(page); | ||
| 9324 | return nested_vmx_failValid(vcpu, | ||
| 9325 | VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); | ||
| 9326 | } | ||
| 9327 | |||
| 9328 | nested_release_vmcs12(vcpu); | ||
| 9329 | |||
| 9330 | /* | ||
| 9331 | * Load VMCS12 from guest memory since it is not already | ||
| 9332 | * cached. | ||
| 9333 | */ | ||
| 9334 | memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); | ||
| 9335 | kunmap(page); | ||
| 9336 | kvm_release_page_clean(page); | ||
| 9337 | |||
| 9338 | set_current_vmptr(vmx, vmptr); | ||
| 9339 | } | ||
| 9340 | |||
| 9341 | return nested_vmx_succeed(vcpu); | ||
| 9342 | } | ||
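Editor's note: the two pointer checks at the top of handle_vmptrld reduce to the small predicate below (an illustrative sketch, not part of the patch): the VMCS pointer must be 4 KiB aligned and must not set any bits at or above the guest's reported MAXPHYADDR.

#include <stdbool.h>
#include <stdint.h>

#define VMCS_PAGE_SIZE 4096ULL	/* VMCS regions are 4 KiB aligned */

/* Mirror of the PAGE_ALIGNED / cpuid_maxphyaddr checks in handle_vmptrld. */
static bool vmptr_is_plausible(uint64_t vmptr, unsigned int guest_maxphyaddr)
{
	if (vmptr & (VMCS_PAGE_SIZE - 1))	/* must be page aligned */
		return false;
	if (vmptr >> guest_maxphyaddr)		/* no bits above MAXPHYADDR */
		return false;
	return true;
}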
| 9343 | |||
| 9344 | /* | ||
| 9345 | * This is the equivalent of the nested hypervisor executing the vmptrld | ||
| 9346 | * instruction. | ||
| 9347 | */ | ||
| 9348 | static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, | ||
| 9349 | bool from_launch) | ||
| 9350 | { | ||
| 9351 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 9352 | struct hv_vp_assist_page assist_page; | ||
| 9353 | |||
| 9354 | if (likely(!vmx->nested.enlightened_vmcs_enabled)) | ||
| 9355 | return 1; | ||
| 9356 | |||
| 9357 | if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page))) | ||
| 9358 | return 1; | ||
| 9359 | |||
| 9360 | if (unlikely(!assist_page.enlighten_vmentry)) | ||
| 9361 | return 1; | ||
| 9362 | |||
| 9363 | if (unlikely(assist_page.current_nested_vmcs != | ||
| 9364 | vmx->nested.hv_evmcs_vmptr)) { | ||
| 9365 | |||
| 9366 | if (!vmx->nested.hv_evmcs) | ||
| 9367 | vmx->nested.current_vmptr = -1ull; | ||
| 9368 | |||
| 9369 | nested_release_evmcs(vcpu); | ||
| 9370 | |||
| 9371 | vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page( | ||
| 9372 | vcpu, assist_page.current_nested_vmcs); | ||
| 9373 | |||
| 9374 | if (unlikely(is_error_page(vmx->nested.hv_evmcs_page))) | ||
| 9375 | return 0; | ||
| 9376 | |||
| 9377 | vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page); | ||
| 9378 | |||
| 9379 | /* | ||
| 9380 | * Currently, KVM only supports eVMCS version 1 | ||
| 9381 | * (== KVM_EVMCS_VERSION), and thus we expect the guest to set | ||
| 9382 | * the first u32 field of the eVMCS, which specifies the eVMCS | ||
| 9383 | * VersionNumber, to this value. | ||
| 9384 | * | ||
| 9385 | * The guest learns the eVMCS versions supported by the host by | ||
| 9386 | * examining CPUID.0x4000000A.EAX[0:15]. The host userspace VMM | ||
| 9387 | * is expected to set this CPUID leaf according to the value | ||
| 9388 | * returned in vmcs_version from nested_enable_evmcs(). | ||
| 9389 | * | ||
| 9390 | * However, it turns out that Microsoft Hyper-V fails to comply | ||
| 9391 | * with its own invented interface: when Hyper-V uses eVMCS, it | ||
| 9392 | * simply sets the first u32 field of the eVMCS to the revision_id | ||
| 9393 | * specified in MSR_IA32_VMX_BASIC, instead of the eVMCS version | ||
| 9394 | * number, which would be one of the supported versions specified | ||
| 9395 | * in CPUID.0x4000000A.EAX[0:15]. | ||
| 9396 | * | ||
| 9397 | * To work around this Hyper-V bug, we accept either a supported | ||
| 9398 | * eVMCS version or the VMCS12 revision_id as a valid value for | ||
| 9399 | * the first u32 field of the eVMCS. | ||
| 9400 | */ | ||
| 9401 | if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && | ||
| 9402 | (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { | ||
| 9403 | nested_release_evmcs(vcpu); | ||
| 9404 | return 0; | ||
| 9405 | } | ||
| 9406 | |||
| 9407 | vmx->nested.dirty_vmcs12 = true; | ||
| 9408 | /* | ||
| 9409 | * As we keep L2 state for one guest only, the 'hv_clean_fields' mask | ||
| 9410 | * can't be used when we switch between guests. Reset it here for | ||
| 9411 | * simplicity. | ||
| 9412 | */ | ||
| 9413 | vmx->nested.hv_evmcs->hv_clean_fields &= | ||
| 9414 | ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; | ||
| 9415 | vmx->nested.hv_evmcs_vmptr = assist_page.current_nested_vmcs; | ||
| 9416 | |||
| 9417 | /* | ||
| 9418 | * Unlike a normal vmcs12, an enlightened vmcs12 is not fully | ||
| 9419 | * reloaded from guest memory (read-only fields, fields not | ||
| 9420 | * present in struct hv_enlightened_vmcs, ...). Make sure there | ||
| 9421 | * are no leftovers. | ||
| 9422 | */ | ||
| 9423 | if (from_launch) { | ||
| 9424 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 9425 | memset(vmcs12, 0, sizeof(*vmcs12)); | ||
| 9426 | vmcs12->hdr.revision_id = VMCS12_REVISION; | ||
| 9427 | } | ||
| 9428 | |||
| 9429 | } | ||
| 9430 | return 1; | ||
| 9431 | } | ||
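Editor's note: the workaround described in the comment above boils down to the following acceptance check (illustrative sketch only; the parameters stand in for KVM_EVMCS_VERSION and VMCS12_REVISION, whose concrete values are defined elsewhere in the source).

#include <stdbool.h>
#include <stdint.h>

/*
 * Accept either the eVMCS version KVM advertises or the VMCS12 revision_id
 * in the first u32 of the eVMCS, to tolerate Hyper-V writing the latter.
 */
static bool evmcs_first_u32_is_acceptable(uint32_t first_u32,
					  uint32_t kvm_evmcs_version,
					  uint32_t vmcs12_revision)
{
	return first_u32 == kvm_evmcs_version || first_u32 == vmcs12_revision;
}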
| 9432 | |||
| 9433 | /* Emulate the VMPTRST instruction */ | ||
| 9434 | static int handle_vmptrst(struct kvm_vcpu *vcpu) | ||
| 9435 | { | ||
| 9436 | unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION); | ||
| 9437 | u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
| 9438 | gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; | ||
| 9439 | struct x86_exception e; | ||
| 9440 | gva_t gva; | ||
| 9441 | |||
| 9442 | if (!nested_vmx_check_permission(vcpu)) | ||
| 9443 | return 1; | ||
| 9444 | |||
| 9445 | if (unlikely(to_vmx(vcpu)->nested.hv_evmcs)) | ||
| 9446 | return 1; | ||
| 9447 | |||
| 9448 | if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva)) | ||
| 9449 | return 1; | ||
| 9450 | /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ | ||
| 9451 | if (kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, | ||
| 9452 | sizeof(gpa_t), &e)) { | ||
| 9453 | kvm_inject_page_fault(vcpu, &e); | ||
| 9454 | return 1; | ||
| 9455 | } | ||
| 9456 | return nested_vmx_succeed(vcpu); | ||
| 9457 | } | ||
| 9458 | |||
| 9459 | /* Emulate the INVEPT instruction */ | ||
| 9460 | static int handle_invept(struct kvm_vcpu *vcpu) | ||
| 9461 | { | ||
| 9462 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 9463 | u32 vmx_instruction_info, types; | ||
| 9464 | unsigned long type; | ||
| 9465 | gva_t gva; | ||
| 9466 | struct x86_exception e; | ||
| 9467 | struct { | ||
| 9468 | u64 eptp, gpa; | ||
| 9469 | } operand; | ||
| 9470 | |||
| 9471 | if (!(vmx->nested.msrs.secondary_ctls_high & | ||
| 9472 | SECONDARY_EXEC_ENABLE_EPT) || | ||
| 9473 | !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { | ||
| 9474 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
| 9475 | return 1; | ||
| 9476 | } | ||
| 9477 | |||
| 9478 | if (!nested_vmx_check_permission(vcpu)) | ||
| 9479 | return 1; | ||
| 9480 | |||
| 9481 | vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
| 9482 | type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); | ||
| 9483 | |||
| 9484 | types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; | ||
| 9485 | |||
| 9486 | if (type >= 32 || !(types & (1 << type))) | ||
| 9487 | return nested_vmx_failValid(vcpu, | ||
| 9488 | VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); | ||
| 9489 | |||
| 9490 | /* According to the Intel VMX instruction reference, the memory | ||
| 9491 | * operand is read even if it isn't needed (e.g., for type==global) | ||
| 9492 | */ | ||
| 9493 | if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), | ||
| 9494 | vmx_instruction_info, false, &gva)) | ||
| 9495 | return 1; | ||
| 9496 | if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { | ||
| 9497 | kvm_inject_page_fault(vcpu, &e); | ||
| 9498 | return 1; | ||
| 9499 | } | ||
| 9500 | |||
| 9501 | switch (type) { | ||
| 9502 | case VMX_EPT_EXTENT_GLOBAL: | ||
| 9503 | /* | ||
| 9504 | * TODO: track mappings and invalidate | ||
| 9505 | * single context requests appropriately | ||
| 9506 | */ | ||
| 9507 | case VMX_EPT_EXTENT_CONTEXT: | ||
| 9508 | kvm_mmu_sync_roots(vcpu); | ||
| 9509 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); | ||
| 9510 | break; | ||
| 9511 | default: | ||
| 9512 | BUG_ON(1); | ||
| 9513 | break; | ||
| 9514 | } | ||
| 9515 | |||
| 9516 | return nested_vmx_succeed(vcpu); | ||
| 9517 | } | ||
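Editor's note: the `types` computation above relies on the layout of the EPT capability MSR. Assuming VMX_EPT_EXTENT_SHIFT is 24, as in the VMX headers, the single-context and all-context INVEPT capability bits land (after the shift) in bit positions that match the architectural type numbers 1 and 2, hence the mask of 6. A standalone sketch of the same check, illustrative only:

#include <stdbool.h>
#include <stdint.h>

#define EPT_EXTENT_SHIFT	24	/* assumption: matches VMX_EPT_EXTENT_SHIFT */

/* True if the requested INVEPT type is advertised in the EPT capabilities. */
static bool invept_type_supported(uint64_t ept_caps, unsigned long type)
{
	/* After the shift: bit 1 = single-context (type 1), bit 2 = global (type 2). */
	unsigned int types = (ept_caps >> EPT_EXTENT_SHIFT) & 6;

	return type < 32 && (types & (1u << type));
}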
| 9518 | |||
| 9519 | static u16 nested_get_vpid02(struct kvm_vcpu *vcpu) | ||
| 9520 | { | ||
| 9521 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 9522 | |||
| 9523 | return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid; | ||
| 9524 | } | ||
| 9525 | |||
| 9526 | static int handle_invvpid(struct kvm_vcpu *vcpu) | ||
| 9527 | { | ||
| 9528 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 9529 | u32 vmx_instruction_info; | ||
| 9530 | unsigned long type, types; | ||
| 9531 | gva_t gva; | ||
| 9532 | struct x86_exception e; | ||
| 9533 | struct { | ||
| 9534 | u64 vpid; | ||
| 9535 | u64 gla; | ||
| 9536 | } operand; | ||
| 9537 | u16 vpid02; | ||
| 9538 | |||
| 9539 | if (!(vmx->nested.msrs.secondary_ctls_high & | ||
| 9540 | SECONDARY_EXEC_ENABLE_VPID) || | ||
| 9541 | !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { | ||
| 9542 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
| 9543 | return 1; | ||
| 9544 | } | ||
| 9545 | |||
| 9546 | if (!nested_vmx_check_permission(vcpu)) | ||
| 9547 | return 1; | ||
| 9548 | |||
| 9549 | vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
| 9550 | type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); | ||
| 9551 | |||
| 9552 | types = (vmx->nested.msrs.vpid_caps & | ||
| 9553 | VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; | ||
| 9554 | |||
| 9555 | if (type >= 32 || !(types & (1 << type))) | ||
| 9556 | return nested_vmx_failValid(vcpu, | ||
| 9557 | VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); | ||
| 9558 | |||
| 9559 | /* According to the Intel VMX instruction reference, the memory | ||
| 9560 | * operand is read even if it isn't needed (e.g., for type==global) | ||
| 9561 | */ | ||
| 9562 | if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), | ||
| 9563 | vmx_instruction_info, false, &gva)) | ||
| 9564 | return 1; | ||
| 9565 | if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { | ||
| 9566 | kvm_inject_page_fault(vcpu, &e); | ||
| 9567 | return 1; | ||
| 9568 | } | ||
| 9569 | if (operand.vpid >> 16) | ||
| 9570 | return nested_vmx_failValid(vcpu, | ||
| 9571 | VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); | ||
| 9572 | |||
| 9573 | vpid02 = nested_get_vpid02(vcpu); | ||
| 9574 | switch (type) { | ||
| 9575 | case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: | ||
| 9576 | if (!operand.vpid || | ||
| 9577 | is_noncanonical_address(operand.gla, vcpu)) | ||
| 9578 | return nested_vmx_failValid(vcpu, | ||
| 9579 | VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); | ||
| 9580 | if (cpu_has_vmx_invvpid_individual_addr()) { | ||
| 9581 | __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, | ||
| 9582 | vpid02, operand.gla); | ||
| 9583 | } else | ||
| 9584 | __vmx_flush_tlb(vcpu, vpid02, false); | ||
| 9585 | break; | ||
| 9586 | case VMX_VPID_EXTENT_SINGLE_CONTEXT: | ||
| 9587 | case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: | ||
| 9588 | if (!operand.vpid) | ||
| 9589 | return nested_vmx_failValid(vcpu, | ||
| 9590 | VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); | ||
| 9591 | __vmx_flush_tlb(vcpu, vpid02, false); | ||
| 9592 | break; | ||
| 9593 | case VMX_VPID_EXTENT_ALL_CONTEXT: | ||
| 9594 | __vmx_flush_tlb(vcpu, vpid02, false); | ||
| 9595 | break; | ||
| 9596 | default: | ||
| 9597 | WARN_ON_ONCE(1); | ||
| 9598 | return kvm_skip_emulated_instruction(vcpu); | ||
| 9599 | } | ||
| 9600 | |||
| 9601 | return nested_vmx_succeed(vcpu); | ||
| 9602 | } | ||
| 9603 | |||
| 9604 | static int handle_invpcid(struct kvm_vcpu *vcpu) | ||
| 9605 | { | ||
| 9606 | u32 vmx_instruction_info; | ||
| 9607 | unsigned long type; | ||
| 9608 | bool pcid_enabled; | ||
| 9609 | gva_t gva; | ||
| 9610 | struct x86_exception e; | ||
| 9611 | unsigned i; | ||
| 9612 | unsigned long roots_to_free = 0; | ||
| 9613 | struct { | ||
| 9614 | u64 pcid; | ||
| 9615 | u64 gla; | ||
| 9616 | } operand; | ||
| 9617 | |||
| 9618 | if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { | ||
| 9619 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
| 9620 | return 1; | ||
| 9621 | } | ||
| 9622 | |||
| 9623 | vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
| 9624 | type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); | ||
| 9625 | |||
| 9626 | if (type > 3) { | ||
| 9627 | kvm_inject_gp(vcpu, 0); | ||
| 9628 | return 1; | ||
| 9629 | } | ||
| 9630 | |||
| 9631 | /* According to the Intel instruction reference, the memory operand | ||
| 9632 | * is read even if it isn't needed (e.g., for type==all) | ||
| 9633 | */ | ||
| 9634 | if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), | ||
| 9635 | vmx_instruction_info, false, &gva)) | ||
| 9636 | return 1; | ||
| 9637 | |||
| 9638 | if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { | ||
| 9639 | kvm_inject_page_fault(vcpu, &e); | ||
| 9640 | return 1; | ||
| 9641 | } | ||
| 9642 | |||
| 9643 | if (operand.pcid >> 12 != 0) { | ||
| 9644 | kvm_inject_gp(vcpu, 0); | ||
| 9645 | return 1; | ||
| 9646 | } | ||
| 9647 | |||
| 9648 | pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); | ||
| 9649 | |||
| 9650 | switch (type) { | ||
| 9651 | case INVPCID_TYPE_INDIV_ADDR: | ||
| 9652 | if ((!pcid_enabled && (operand.pcid != 0)) || | ||
| 9653 | is_noncanonical_address(operand.gla, vcpu)) { | ||
| 9654 | kvm_inject_gp(vcpu, 0); | ||
| 9655 | return 1; | ||
| 9656 | } | ||
| 9657 | kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid); | ||
| 9658 | return kvm_skip_emulated_instruction(vcpu); | ||
| 9659 | |||
| 9660 | case INVPCID_TYPE_SINGLE_CTXT: | ||
| 9661 | if (!pcid_enabled && (operand.pcid != 0)) { | ||
| 9662 | kvm_inject_gp(vcpu, 0); | ||
| 9663 | return 1; | ||
| 9664 | } | ||
| 9665 | |||
| 9666 | if (kvm_get_active_pcid(vcpu) == operand.pcid) { | ||
| 9667 | kvm_mmu_sync_roots(vcpu); | ||
| 9668 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); | ||
| 9669 | } | ||
| 9670 | |||
| 9671 | for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) | ||
| 9672 | if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3) | ||
| 9673 | == operand.pcid) | ||
| 9674 | roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); | ||
| 9675 | |||
| 9676 | kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); | ||
| 9677 | /* | ||
| 9678 | * If neither the current cr3 nor any of the prev_roots use the | ||
| 9679 | * given PCID, then nothing needs to be done here because a | ||
| 9680 | * resync will happen anyway before switching to any other CR3. | ||
| 9681 | */ | ||
| 9682 | |||
| 9683 | return kvm_skip_emulated_instruction(vcpu); | ||
| 9684 | |||
| 9685 | case INVPCID_TYPE_ALL_NON_GLOBAL: | ||
| 9686 | /* | ||
| 9687 | * Currently, KVM doesn't mark global entries in the shadow | ||
| 9688 | * page tables, so a non-global flush just degenerates to a | ||
| 9689 | * global flush. If needed, we could optimize this later by | ||
| 9690 | * keeping track of global entries in shadow page tables. | ||
| 9691 | */ | ||
| 9692 | |||
| 9693 | /* fall-through */ | ||
| 9694 | case INVPCID_TYPE_ALL_INCL_GLOBAL: | ||
| 9695 | kvm_mmu_unload(vcpu); | ||
| 9696 | return kvm_skip_emulated_instruction(vcpu); | ||
| 9697 | |||
| 9698 | default: | ||
| 9699 | BUG(); /* We have already checked above that type <= 3 */ | ||
| 9700 | } | ||
| 9701 | } | ||
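Editor's note: a compact restatement of the operand validation in handle_invpcid, as an illustrative sketch rather than kernel code: the descriptor's PCID occupies bits 11:0 (so anything above bit 11 is reserved and must be zero), and only types 0-3 are architecturally defined.

#include <stdbool.h>
#include <stdint.h>

struct invpcid_desc {
	uint64_t pcid;	/* only bits 11:0 may be non-zero */
	uint64_t gla;	/* linear address, used by type 0 only */
};

/* Returns true if the INVPCID type/descriptor pair passes the basic checks. */
static bool invpcid_operand_ok(unsigned long type, const struct invpcid_desc *desc)
{
	if (type > 3)			/* only types 0-3 are defined */
		return false;
	if (desc->pcid >> 12)		/* reserved bits 63:12 must be zero */
		return false;
	return true;
}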
| 9702 | |||
| 9703 | static int handle_pml_full(struct kvm_vcpu *vcpu) | ||
| 9704 | { | ||
| 9705 | unsigned long exit_qualification; | ||
| 9706 | |||
| 9707 | trace_kvm_pml_full(vcpu->vcpu_id); | ||
| 9708 | |||
| 9709 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 9710 | |||
| 9711 | /* | ||
| 9712 | * If the PML-buffer-full exit happened while executing IRET from an NMI, | ||
| 9713 | * the "blocked by NMI" bit has to be set before the next VM entry. | ||
| 9714 | */ | ||
| 9715 | if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && | ||
| 9716 | enable_vnmi && | ||
| 9717 | (exit_qualification & INTR_INFO_UNBLOCK_NMI)) | ||
| 9718 | vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, | ||
| 9719 | GUEST_INTR_STATE_NMI); | ||
| 9720 | |||
| 9721 | /* | ||
| 9722 | * The PML buffer was already flushed at the beginning of the VMEXIT. | ||
| 9723 | * Nothing to do here, and there's no userspace involvement needed for PML. | ||
| 9724 | */ | ||
| 9725 | return 1; | ||
| 9726 | } | ||
| 9727 | |||
| 9728 | static int handle_preemption_timer(struct kvm_vcpu *vcpu) | ||
| 9729 | { | ||
| 9730 | if (!to_vmx(vcpu)->req_immediate_exit) | ||
| 9731 | kvm_lapic_expired_hv_timer(vcpu); | ||
| 9732 | return 1; | ||
| 9733 | } | ||
| 9734 | |||
| 9735 | static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) | ||
| 9736 | { | ||
| 9737 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 9738 | int maxphyaddr = cpuid_maxphyaddr(vcpu); | ||
| 9739 | |||
| 9740 | /* Check for memory type validity */ | ||
| 9741 | switch (address & VMX_EPTP_MT_MASK) { | ||
| 9742 | case VMX_EPTP_MT_UC: | ||
| 9743 | if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)) | ||
| 9744 | return false; | ||
| 9745 | break; | ||
| 9746 | case VMX_EPTP_MT_WB: | ||
| 9747 | if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)) | ||
| 9748 | return false; | ||
| 9749 | break; | ||
| 9750 | default: | ||
| 9751 | return false; | ||
| 9752 | } | ||
| 9753 | |||
| 9754 | /* Only a 4-level page-walk length is valid */ | ||
| 9755 | if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4) | ||
| 9756 | return false; | ||
| 9757 | |||
| 9758 | /* Reserved bits should not be set */ | ||
| 9759 | if (address >> maxphyaddr || ((address >> 7) & 0x1f)) | ||
| 9760 | return false; | ||
| 9761 | |||
| 9762 | /* AD, if set, should be supported */ | ||
| 9763 | if (address & VMX_EPTP_AD_ENABLE_BIT) { | ||
| 9764 | if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)) | ||
| 9765 | return false; | ||
| 9766 | } | ||
| 9767 | |||
| 9768 | return true; | ||
| 9769 | } | ||
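Editor's note: valid_ept_address checks the EPTP value field by field; the layout it assumes (per the Intel SDM EPTP format) is summarized in this illustrative decode, not part of the patch:

#include <stdint.h>

/* Decompose an EPTP value into the fields tested by valid_ept_address. */
struct eptp_fields {
	unsigned int memory_type;	/* bits 2:0: 0 = UC, 6 = WB */
	unsigned int walk_length;	/* bits 5:3: page-walk length minus one */
	unsigned int ad_enabled;	/* bit 6: accessed/dirty flags enabled */
	unsigned int reserved_11_7;	/* bits 11:7: must be zero */
	uint64_t     pml4_gpa;		/* bits 12 and up (below MAXPHYADDR): EPT PML4 address */
};

static struct eptp_fields eptp_decode(uint64_t eptp)
{
	return (struct eptp_fields) {
		.memory_type   = eptp & 0x7,
		.walk_length   = (eptp >> 3) & 0x7,
		.ad_enabled    = (eptp >> 6) & 0x1,
		.reserved_11_7 = (eptp >> 7) & 0x1f,
		.pml4_gpa      = eptp & ~0xfffULL,
	};
}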
| 9770 | |||
| 9771 | static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, | ||
| 9772 | struct vmcs12 *vmcs12) | ||
| 9773 | { | ||
| 9774 | u32 index = vcpu->arch.regs[VCPU_REGS_RCX]; | ||
| 9775 | u64 address; | ||
| 9776 | bool accessed_dirty; | ||
| 9777 | struct kvm_mmu *mmu = vcpu->arch.walk_mmu; | ||
| 9778 | |||
| 9779 | if (!nested_cpu_has_eptp_switching(vmcs12) || | ||
| 9780 | !nested_cpu_has_ept(vmcs12)) | ||
| 9781 | return 1; | ||
| 9782 | |||
| 9783 | if (index >= VMFUNC_EPTP_ENTRIES) | ||
| 9784 | return 1; | ||
| 9785 | |||
| 9786 | |||
| 9787 | if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, | ||
| 9788 | &address, index * 8, 8)) | ||
| 9789 | return 1; | ||
| 9790 | |||
| 9791 | accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT); | ||
| 9792 | |||
| 9793 | /* | ||
| 9794 | * If the (L2) guest does a vmfunc to the currently | ||
| 9795 | * active ept pointer, we don't have to do anything else | ||
| 9796 | */ | ||
| 9797 | if (vmcs12->ept_pointer != address) { | ||
| 9798 | if (!valid_ept_address(vcpu, address)) | ||
| 9799 | return 1; | ||
| 9800 | |||
| 9801 | kvm_mmu_unload(vcpu); | ||
| 9802 | mmu->ept_ad = accessed_dirty; | ||
| 9803 | mmu->mmu_role.base.ad_disabled = !accessed_dirty; | ||
| 9804 | vmcs12->ept_pointer = address; | ||
| 9805 | /* | ||
| 9806 | * TODO: Check what the correct approach is in case | ||
| 9807 | * the mmu reload fails. Currently, we just let the next | ||
| 9808 | * reload potentially fail. | ||
| 9809 | */ | ||
| 9810 | kvm_mmu_reload(vcpu); | ||
| 9811 | } | ||
| 9812 | |||
| 9813 | return 0; | ||
| 9814 | } | ||
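Editor's note: the VMFUNC EPTP list referenced above is a single guest page holding up to 512 eight-byte EPT pointers, so the entry for a given index lives at eptp_list_address plus index*8, which is what the (offset, len) arguments to kvm_vcpu_read_guest_page express. A small sketch of the address arithmetic, illustrative only:

#include <stdint.h>

#define EPTP_LIST_ENTRIES	512	/* one 4 KiB page of 8-byte pointers */

/* Guest-physical address of entry 'index' in the EPTP list page. */
static uint64_t eptp_list_entry_gpa(uint64_t eptp_list_address, uint32_t index)
{
	/* Caller must have checked index < EPTP_LIST_ENTRIES. */
	return eptp_list_address + (uint64_t)index * sizeof(uint64_t);
}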
| 9815 | |||
| 9816 | static int handle_vmfunc(struct kvm_vcpu *vcpu) | ||
| 9817 | { | ||
| 9818 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 9819 | struct vmcs12 *vmcs12; | ||
| 9820 | u32 function = vcpu->arch.regs[VCPU_REGS_RAX]; | ||
| 9821 | |||
| 9822 | /* | ||
| 9823 | * VMFUNC is only supported for nested guests, but we always enable the | ||
| 9824 | * secondary control for simplicity; for non-nested mode, fake that we | ||
| 9825 | * didn't by injecting #UD. | ||
| 9826 | */ | ||
| 9827 | if (!is_guest_mode(vcpu)) { | ||
| 9828 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
| 9829 | return 1; | ||
| 9830 | } | ||
| 9831 | |||
| 9832 | vmcs12 = get_vmcs12(vcpu); | ||
| 9833 | if ((vmcs12->vm_function_control & (1 << function)) == 0) | ||
| 9834 | goto fail; | ||
| 9835 | |||
| 9836 | switch (function) { | ||
| 9837 | case 0: | ||
| 9838 | if (nested_vmx_eptp_switching(vcpu, vmcs12)) | ||
| 9839 | goto fail; | ||
| 9840 | break; | ||
| 9841 | default: | ||
| 9842 | goto fail; | ||
| 9843 | } | ||
| 9844 | return kvm_skip_emulated_instruction(vcpu); | ||
| 9845 | |||
| 9846 | fail: | ||
| 9847 | nested_vmx_vmexit(vcpu, vmx->exit_reason, | ||
| 9848 | vmcs_read32(VM_EXIT_INTR_INFO), | ||
| 9849 | vmcs_readl(EXIT_QUALIFICATION)); | ||
| 9850 | return 1; | ||
| 9851 | } | ||
| 9852 | |||
| 9853 | static int handle_encls(struct kvm_vcpu *vcpu) | ||
| 9854 | { | ||
| 9855 | /* | ||
| 9856 | * SGX virtualization is not yet supported. There is no software | ||
| 9857 | * enable bit for SGX, so we have to trap ENCLS and inject a #UD | ||
| 9858 | * to prevent the guest from executing ENCLS. | ||
| 9859 | */ | ||
| 9860 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
| 9861 | return 1; | ||
| 9862 | } | ||
| 9863 | |||
| 9864 | /* | ||
| 9865 | * The exit handlers return 1 if the exit was handled fully and guest execution | ||
| 9866 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs | ||
| 9867 | * to be done to userspace and return 0. | ||
| 9868 | */ | ||
| 9869 | static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { | ||
| 9870 | [EXIT_REASON_EXCEPTION_NMI] = handle_exception, | ||
| 9871 | [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, | ||
| 9872 | [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, | ||
| 9873 | [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, | ||
| 9874 | [EXIT_REASON_IO_INSTRUCTION] = handle_io, | ||
| 9875 | [EXIT_REASON_CR_ACCESS] = handle_cr, | ||
| 9876 | [EXIT_REASON_DR_ACCESS] = handle_dr, | ||
| 9877 | [EXIT_REASON_CPUID] = handle_cpuid, | ||
| 9878 | [EXIT_REASON_MSR_READ] = handle_rdmsr, | ||
| 9879 | [EXIT_REASON_MSR_WRITE] = handle_wrmsr, | ||
| 9880 | [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, | ||
| 9881 | [EXIT_REASON_HLT] = handle_halt, | ||
| 9882 | [EXIT_REASON_INVD] = handle_invd, | ||
| 9883 | [EXIT_REASON_INVLPG] = handle_invlpg, | ||
| 9884 | [EXIT_REASON_RDPMC] = handle_rdpmc, | ||
| 9885 | [EXIT_REASON_VMCALL] = handle_vmcall, | ||
| 9886 | [EXIT_REASON_VMCLEAR] = handle_vmclear, | ||
| 9887 | [EXIT_REASON_VMLAUNCH] = handle_vmlaunch, | ||
| 9888 | [EXIT_REASON_VMPTRLD] = handle_vmptrld, | ||
| 9889 | [EXIT_REASON_VMPTRST] = handle_vmptrst, | ||
| 9890 | [EXIT_REASON_VMREAD] = handle_vmread, | ||
| 9891 | [EXIT_REASON_VMRESUME] = handle_vmresume, | ||
| 9892 | [EXIT_REASON_VMWRITE] = handle_vmwrite, | ||
| 9893 | [EXIT_REASON_VMOFF] = handle_vmoff, | ||
| 9894 | [EXIT_REASON_VMON] = handle_vmon, | ||
| 9895 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, | ||
| 9896 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, | ||
| 9897 | [EXIT_REASON_APIC_WRITE] = handle_apic_write, | ||
| 9898 | [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, | ||
| 9899 | [EXIT_REASON_WBINVD] = handle_wbinvd, | ||
| 9900 | [EXIT_REASON_XSETBV] = handle_xsetbv, | ||
| 9901 | [EXIT_REASON_TASK_SWITCH] = handle_task_switch, | ||
| 9902 | [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, | ||
| 9903 | [EXIT_REASON_GDTR_IDTR] = handle_desc, | ||
| 9904 | [EXIT_REASON_LDTR_TR] = handle_desc, | ||
| 9905 | [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, | ||
| 9906 | [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, | ||
| 9907 | [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, | ||
| 9908 | [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, | ||
| 9909 | [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, | ||
| 9910 | [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, | ||
| 9911 | [EXIT_REASON_INVEPT] = handle_invept, | ||
| 9912 | [EXIT_REASON_INVVPID] = handle_invvpid, | ||
| 9913 | [EXIT_REASON_RDRAND] = handle_invalid_op, | ||
| 9914 | [EXIT_REASON_RDSEED] = handle_invalid_op, | ||
| 9915 | [EXIT_REASON_XSAVES] = handle_xsaves, | ||
| 9916 | [EXIT_REASON_XRSTORS] = handle_xrstors, | ||
| 9917 | [EXIT_REASON_PML_FULL] = handle_pml_full, | ||
| 9918 | [EXIT_REASON_INVPCID] = handle_invpcid, | ||
| 9919 | [EXIT_REASON_VMFUNC] = handle_vmfunc, | ||
| 9920 | [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, | ||
| 9921 | [EXIT_REASON_ENCLS] = handle_encls, | ||
| 9922 | }; | ||
| 9923 | |||
| 9924 | static const int kvm_vmx_max_exit_handlers = | ||
| 9925 | ARRAY_SIZE(kvm_vmx_exit_handlers); | ||
| 9926 | |||
| 9927 | static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, | ||
| 9928 | struct vmcs12 *vmcs12) | ||
| 9929 | { | ||
| 9930 | unsigned long exit_qualification; | ||
| 9931 | gpa_t bitmap, last_bitmap; | ||
| 9932 | unsigned int port; | ||
| 9933 | int size; | ||
| 9934 | u8 b; | ||
| 9935 | |||
| 9936 | if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) | ||
| 9937 | return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); | ||
| 9938 | |||
| 9939 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 9940 | |||
| 9941 | port = exit_qualification >> 16; | ||
| 9942 | size = (exit_qualification & 7) + 1; | ||
| 9943 | |||
| 9944 | last_bitmap = (gpa_t)-1; | ||
| 9945 | b = -1; | ||
| 9946 | |||
| 9947 | while (size > 0) { | ||
| 9948 | if (port < 0x8000) | ||
| 9949 | bitmap = vmcs12->io_bitmap_a; | ||
| 9950 | else if (port < 0x10000) | ||
| 9951 | bitmap = vmcs12->io_bitmap_b; | ||
| 9952 | else | ||
| 9953 | return true; | ||
| 9954 | bitmap += (port & 0x7fff) / 8; | ||
| 9955 | |||
| 9956 | if (last_bitmap != bitmap) | ||
| 9957 | if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) | ||
| 9958 | return true; | ||
| 9959 | if (b & (1 << (port & 7))) | ||
| 9960 | return true; | ||
| 9961 | |||
| 9962 | port++; | ||
| 9963 | size--; | ||
| 9964 | last_bitmap = bitmap; | ||
| 9965 | } | ||
| 9966 | |||
| 9967 | return false; | ||
| 9968 | } | ||
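Editor's note: the loop above walks L1's two 4 KiB I/O bitmaps byte by byte. The port-to-bit mapping it uses is shown in isolation below (illustrative sketch): ports 0x0000-0x7FFF are covered by bitmap A, ports 0x8000-0xFFFF by bitmap B, and within the selected bitmap the byte offset is (port & 0x7fff) / 8 with bit (port & 7).

#include <stdint.h>

struct io_bitmap_pos {
	uint64_t     byte_gpa;	/* guest-physical address of the byte to read */
	unsigned int bit;	/* bit within that byte */
};

/* Locate the intercept bit for an I/O port in L1's I/O bitmaps. */
static struct io_bitmap_pos io_bitmap_locate(uint16_t port,
					     uint64_t bitmap_a_gpa,
					     uint64_t bitmap_b_gpa)
{
	uint64_t base = (port < 0x8000) ? bitmap_a_gpa : bitmap_b_gpa;

	return (struct io_bitmap_pos) {
		.byte_gpa = base + (port & 0x7fff) / 8,
		.bit      = port & 7,
	};
}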
| 9969 | |||
| 9970 | /* | ||
| 9971 | * Return 1 if we should exit from L2 to L1 to handle an MSR access, | ||
| 9972 | * rather than handle it ourselves in L0. I.e., check whether L1 expressed | ||
| 9973 | * interest in intercepting the current event (a read or write of a specific | ||
| 9974 | * MSR) by using an MSR bitmap. L1 may do so even when L0 doesn't use MSR bitmaps. | ||
| 9975 | */ | ||
| 9976 | static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, | ||
| 9977 | struct vmcs12 *vmcs12, u32 exit_reason) | ||
| 9978 | { | ||
| 9979 | u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; | ||
| 9980 | gpa_t bitmap; | ||
| 9981 | |||
| 9982 | if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) | ||
| 9983 | return true; | ||
| 9984 | |||
| 9985 | /* | ||
| 9986 | * The MSR_BITMAP page is divided into four 1024-byte bitmaps, | ||
| 9987 | * for the four combinations of read/write and low/high MSR numbers. | ||
| 9988 | * First we need to figure out which of the four to use: | ||
| 9989 | */ | ||
| 9990 | bitmap = vmcs12->msr_bitmap; | ||
| 9991 | if (exit_reason == EXIT_REASON_MSR_WRITE) | ||
| 9992 | bitmap += 2048; | ||
| 9993 | if (msr_index >= 0xc0000000) { | ||
| 9994 | msr_index -= 0xc0000000; | ||
| 9995 | bitmap += 1024; | ||
| 9996 | } | ||
| 9997 | |||
| 9998 | /* Then read the msr_index'th bit from this bitmap: */ | ||
| 9999 | if (msr_index < 1024*8) { | ||
| 10000 | unsigned char b; | ||
| 10001 | if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) | ||
| 10002 | return true; | ||
| 10003 | return 1 & (b >> (msr_index & 7)); | ||
| 10004 | } else | ||
| 10005 | return true; /* let L1 handle the wrong parameter */ | ||
| 10006 | } | ||
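Editor's note: the arithmetic above follows the MSR-bitmap layout described in the comment: one 4 KiB page split into four 1 KiB regions in the order read-low (MSRs 0x00000000-0x00001FFF), read-high (0xC0000000-0xC0001FFF), write-low, write-high. A standalone sketch of the offset calculation, illustrative only:

#include <stdbool.h>
#include <stdint.h>

struct msr_bitmap_pos {
	uint64_t     byte_offset;	/* offset into the 4 KiB MSR-bitmap page */
	unsigned int bit;		/* bit within that byte */
	bool         out_of_range;	/* MSR not covered by the bitmap at all */
};

/* Locate the intercept bit for an MSR access in L1's MSR bitmap. */
static struct msr_bitmap_pos msr_bitmap_locate(uint32_t msr, bool is_write)
{
	uint64_t off = is_write ? 2048 : 0;	/* write bitmaps follow the read bitmaps */

	if (msr >= 0xc0000000) {		/* "high" MSR range */
		msr -= 0xc0000000;
		off += 1024;
	}

	return (struct msr_bitmap_pos) {
		.byte_offset  = off + msr / 8,
		.bit          = msr & 7,
		.out_of_range = msr >= 1024 * 8,	/* only 8192 MSRs per half */
	};
}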
| 10007 | |||
| 10008 | /* | ||
| 10009 | * Return 1 if we should exit from L2 to L1 to handle a CR access exit, | ||
| 10010 | * rather than handle it ourselves in L0. I.e., check if L1 wanted to | ||
| 10011 | * intercept (via guest_host_mask etc.) the current event. | ||
| 10012 | */ | ||
| 10013 | static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, | ||
| 10014 | struct vmcs12 *vmcs12) | ||
| 10015 | { | ||
| 10016 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 10017 | int cr = exit_qualification & 15; | ||
| 10018 | int reg; | ||
| 10019 | unsigned long val; | ||
| 10020 | |||
| 10021 | switch ((exit_qualification >> 4) & 3) { | ||
| 10022 | case 0: /* mov to cr */ | ||
| 10023 | reg = (exit_qualification >> 8) & 15; | ||
| 10024 | val = kvm_register_readl(vcpu, reg); | ||
| 10025 | switch (cr) { | ||
| 10026 | case 0: | ||
| 10027 | if (vmcs12->cr0_guest_host_mask & | ||
| 10028 | (val ^ vmcs12->cr0_read_shadow)) | ||
| 10029 | return true; | ||
| 10030 | break; | ||
| 10031 | case 3: | ||
| 10032 | if ((vmcs12->cr3_target_count >= 1 && | ||
| 10033 | vmcs12->cr3_target_value0 == val) || | ||
| 10034 | (vmcs12->cr3_target_count >= 2 && | ||
| 10035 | vmcs12->cr3_target_value1 == val) || | ||
| 10036 | (vmcs12->cr3_target_count >= 3 && | ||
| 10037 | vmcs12->cr3_target_value2 == val) || | ||
| 10038 | (vmcs12->cr3_target_count >= 4 && | ||
| 10039 | vmcs12->cr3_target_value3 == val)) | ||
| 10040 | return false; | ||
| 10041 | if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) | ||
| 10042 | return true; | ||
| 10043 | break; | ||
| 10044 | case 4: | ||
| 10045 | if (vmcs12->cr4_guest_host_mask & | ||
| 10046 | (vmcs12->cr4_read_shadow ^ val)) | ||
| 10047 | return true; | ||
| 10048 | break; | ||
| 10049 | case 8: | ||
| 10050 | if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) | ||
| 10051 | return true; | ||
| 10052 | break; | ||
| 10053 | } | ||
| 10054 | break; | ||
| 10055 | case 2: /* clts */ | ||
| 10056 | if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && | ||
| 10057 | (vmcs12->cr0_read_shadow & X86_CR0_TS)) | ||
| 10058 | return true; | ||
| 10059 | break; | ||
| 10060 | case 1: /* mov from cr */ | ||
| 10061 | switch (cr) { | ||
| 10062 | case 3: | ||
| 10063 | if (vmcs12->cpu_based_vm_exec_control & | ||
| 10064 | CPU_BASED_CR3_STORE_EXITING) | ||
| 10065 | return true; | ||
| 10066 | break; | ||
| 10067 | case 8: | ||
| 10068 | if (vmcs12->cpu_based_vm_exec_control & | ||
| 10069 | CPU_BASED_CR8_STORE_EXITING) | ||
| 10070 | return true; | ||
| 10071 | break; | ||
| 10072 | } | ||
| 10073 | break; | ||
| 10074 | case 3: /* lmsw */ | ||
| 10075 | /* | ||
| 10076 | * lmsw can change bits 1..3 of cr0, and only set bit 0 of | ||
| 10077 | * cr0. Other attempted changes are ignored, with no exit. | ||
| 10078 | */ | ||
| 10079 | val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; | ||
| 10080 | if (vmcs12->cr0_guest_host_mask & 0xe & | ||
| 10081 | (val ^ vmcs12->cr0_read_shadow)) | ||
| 10082 | return true; | ||
| 10083 | if ((vmcs12->cr0_guest_host_mask & 0x1) && | ||
| 10084 | !(vmcs12->cr0_read_shadow & 0x1) && | ||
| 10085 | (val & 0x1)) | ||
| 10086 | return true; | ||
| 10087 | break; | ||
| 10088 | } | ||
| 10089 | return false; | ||
| 10090 | } | ||
| 10091 | |||
| 10092 | static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, | ||
| 10093 | struct vmcs12 *vmcs12, gpa_t bitmap) | ||
| 10094 | { | ||
| 10095 | u32 vmx_instruction_info; | ||
| 10096 | unsigned long field; | ||
| 10097 | u8 b; | ||
| 10098 | |||
| 10099 | if (!nested_cpu_has_shadow_vmcs(vmcs12)) | ||
| 10100 | return true; | ||
| 10101 | |||
| 10102 | /* Decode instruction info and find the field to access */ | ||
| 10103 | vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
| 10104 | field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); | ||
| 10105 | |||
| 10106 | /* Out-of-range fields always cause a VM exit from L2 to L1 */ | ||
| 10107 | if (field >> 15) | ||
| 10108 | return true; | ||
| 10109 | |||
| 10110 | if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) | ||
| 10111 | return true; | ||
| 10112 | |||
| 10113 | return 1 & (b >> (field & 7)); | ||
| 10114 | } | ||
| 10115 | |||
| 10116 | /* | ||
| 10117 | * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we | ||
| 10118 | * should handle it ourselves in L0 (and then continue L2). Only call this | ||
| 10119 | * when in is_guest_mode (L2). | ||
| 10120 | */ | ||
| 10121 | static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) | ||
| 10122 | { | ||
| 10123 | u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | ||
| 10124 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 10125 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 10126 | |||
| 10127 | if (vmx->nested.nested_run_pending) | ||
| 10128 | return false; | ||
| 10129 | |||
| 10130 | if (unlikely(vmx->fail)) { | ||
| 10131 | pr_info_ratelimited("%s failed vm entry %x\n", __func__, | ||
| 10132 | vmcs_read32(VM_INSTRUCTION_ERROR)); | ||
| 10133 | return true; | ||
| 10134 | } | ||
| 10135 | |||
| 10136 | /* | ||
| 10137 | * The host physical addresses of some pages of guest memory | ||
| 10138 | * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC | ||
| 10139 | * Page). The CPU may write to these pages via their host | ||
| 10140 | * physical address while L2 is running, bypassing any | ||
| 10141 | * address-translation-based dirty tracking (e.g. EPT write | ||
| 10142 | * protection). | ||
| 10143 | * | ||
| 10144 | * Mark them dirty on every exit from L2 to prevent them from | ||
| 10145 | * getting out of sync with dirty tracking. | ||
| 10146 | */ | ||
| 10147 | nested_mark_vmcs12_pages_dirty(vcpu); | ||
| 10148 | |||
| 10149 | trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, | ||
| 10150 | vmcs_readl(EXIT_QUALIFICATION), | ||
| 10151 | vmx->idt_vectoring_info, | ||
| 10152 | intr_info, | ||
| 10153 | vmcs_read32(VM_EXIT_INTR_ERROR_CODE), | ||
| 10154 | KVM_ISA_VMX); | ||
| 10155 | |||
| 10156 | switch (exit_reason) { | ||
| 10157 | case EXIT_REASON_EXCEPTION_NMI: | ||
| 10158 | if (is_nmi(intr_info)) | ||
| 10159 | return false; | ||
| 10160 | else if (is_page_fault(intr_info)) | ||
| 10161 | return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept; | ||
| 10162 | else if (is_debug(intr_info) && | ||
| 10163 | vcpu->guest_debug & | ||
| 10164 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) | ||
| 10165 | return false; | ||
| 10166 | else if (is_breakpoint(intr_info) && | ||
| 10167 | vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) | ||
| 10168 | return false; | ||
| 10169 | return vmcs12->exception_bitmap & | ||
| 10170 | (1u << (intr_info & INTR_INFO_VECTOR_MASK)); | ||
| 10171 | case EXIT_REASON_EXTERNAL_INTERRUPT: | ||
| 10172 | return false; | ||
| 10173 | case EXIT_REASON_TRIPLE_FAULT: | ||
| 10174 | return true; | ||
| 10175 | case EXIT_REASON_PENDING_INTERRUPT: | ||
| 10176 | return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); | ||
| 10177 | case EXIT_REASON_NMI_WINDOW: | ||
| 10178 | return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); | ||
| 10179 | case EXIT_REASON_TASK_SWITCH: | ||
| 10180 | return true; | ||
| 10181 | case EXIT_REASON_CPUID: | ||
| 10182 | return true; | ||
| 10183 | case EXIT_REASON_HLT: | ||
| 10184 | return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); | ||
| 10185 | case EXIT_REASON_INVD: | ||
| 10186 | return true; | ||
| 10187 | case EXIT_REASON_INVLPG: | ||
| 10188 | return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); | ||
| 10189 | case EXIT_REASON_RDPMC: | ||
| 10190 | return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); | ||
| 10191 | case EXIT_REASON_RDRAND: | ||
| 10192 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); | ||
| 10193 | case EXIT_REASON_RDSEED: | ||
| 10194 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); | ||
| 10195 | case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: | ||
| 10196 | return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); | ||
| 10197 | case EXIT_REASON_VMREAD: | ||
| 10198 | return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, | ||
| 10199 | vmcs12->vmread_bitmap); | ||
| 10200 | case EXIT_REASON_VMWRITE: | ||
| 10201 | return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, | ||
| 10202 | vmcs12->vmwrite_bitmap); | ||
| 10203 | case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: | ||
| 10204 | case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: | ||
| 10205 | case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: | ||
| 10206 | case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: | ||
| 10207 | case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: | ||
| 10208 | /* | ||
| 10209 | * VMX instructions trap unconditionally. This allows L1 to | ||
| 10210 | * emulate them for its L2 guest, i.e., allows 3-level nesting! | ||
| 10211 | */ | ||
| 10212 | return true; | ||
| 10213 | case EXIT_REASON_CR_ACCESS: | ||
| 10214 | return nested_vmx_exit_handled_cr(vcpu, vmcs12); | ||
| 10215 | case EXIT_REASON_DR_ACCESS: | ||
| 10216 | return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); | ||
| 10217 | case EXIT_REASON_IO_INSTRUCTION: | ||
| 10218 | return nested_vmx_exit_handled_io(vcpu, vmcs12); | ||
| 10219 | case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: | ||
| 10220 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); | ||
| 10221 | case EXIT_REASON_MSR_READ: | ||
| 10222 | case EXIT_REASON_MSR_WRITE: | ||
| 10223 | return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); | ||
| 10224 | case EXIT_REASON_INVALID_STATE: | ||
| 10225 | return true; | ||
| 10226 | case EXIT_REASON_MWAIT_INSTRUCTION: | ||
| 10227 | return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); | ||
| 10228 | case EXIT_REASON_MONITOR_TRAP_FLAG: | ||
| 10229 | return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG); | ||
| 10230 | case EXIT_REASON_MONITOR_INSTRUCTION: | ||
| 10231 | return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); | ||
| 10232 | case EXIT_REASON_PAUSE_INSTRUCTION: | ||
| 10233 | return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || | ||
| 10234 | nested_cpu_has2(vmcs12, | ||
| 10235 | SECONDARY_EXEC_PAUSE_LOOP_EXITING); | ||
| 10236 | case EXIT_REASON_MCE_DURING_VMENTRY: | ||
| 10237 | return false; | ||
| 10238 | case EXIT_REASON_TPR_BELOW_THRESHOLD: | ||
| 10239 | return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); | ||
| 10240 | case EXIT_REASON_APIC_ACCESS: | ||
| 10241 | case EXIT_REASON_APIC_WRITE: | ||
| 10242 | case EXIT_REASON_EOI_INDUCED: | ||
| 10243 | /* | ||
| 10244 | * The controls for "virtualize APIC accesses," "APIC- | ||
| 10245 | * register virtualization," and "virtual-interrupt | ||
| 10246 | * delivery" only come from vmcs12. | ||
| 10247 | */ | ||
| 10248 | return true; | ||
| 10249 | case EXIT_REASON_EPT_VIOLATION: | ||
| 10250 | /* | ||
| 10251 | * L0 always deals with the EPT violation. If nested EPT is | ||
| 10252 | * used, and the nested mmu code discovers that the address is | ||
| 10253 | * missing in the guest EPT table (EPT12), the EPT violation | ||
| 10254 | * will be injected with nested_ept_inject_page_fault() | ||
| 10255 | */ | ||
| 10256 | return false; | ||
| 10257 | case EXIT_REASON_EPT_MISCONFIG: | ||
| 10258 | /* | ||
| 10259 | * L2 never directly uses L1's EPT, but rather L0's own EPT | ||
| 10260 | * table (shadow on EPT) or a merged EPT table that L0 built | ||
| 10261 | * (EPT on EPT). So any problems with the structure of the | ||
| 10262 | * table are L0's fault. | ||
| 10263 | */ | ||
| 10264 | return false; | ||
| 10265 | case EXIT_REASON_INVPCID: | ||
| 10266 | return | ||
| 10267 | nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && | ||
| 10268 | nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); | ||
| 10269 | case EXIT_REASON_WBINVD: | ||
| 10270 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); | ||
| 10271 | case EXIT_REASON_XSETBV: | ||
| 10272 | return true; | ||
| 10273 | case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: | ||
| 10274 | /* | ||
| 10275 | * This should never happen, since it is not possible to | ||
| 10276 | * set XSS to a non-zero value---neither in L1 nor in L2. | ||
| 10277 | * If it were, XSS would have to be checked against | ||
| 10278 | * the XSS exit bitmap in vmcs12. | ||
| 10279 | */ | ||
| 10280 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); | ||
| 10281 | case EXIT_REASON_PREEMPTION_TIMER: | ||
| 10282 | return false; | ||
| 10283 | case EXIT_REASON_PML_FULL: | ||
| 10284 | /* We emulate PML support to L1. */ | ||
| 10285 | return false; | ||
| 10286 | case EXIT_REASON_VMFUNC: | ||
| 10287 | /* VM functions are emulated through L2->L0 vmexits. */ | ||
| 10288 | return false; | ||
| 10289 | case EXIT_REASON_ENCLS: | ||
| 10290 | /* SGX is never exposed to L1 */ | ||
| 10291 | return false; | ||
| 10292 | default: | ||
| 10293 | return true; | ||
| 10294 | } | ||
| 10295 | } | ||
| 10296 | |||
| 10297 | static int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason) | ||
| 10298 | { | ||
| 10299 | u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | ||
| 10300 | |||
| 10301 | /* | ||
| 10302 | * At this point, the exit interruption info in exit_intr_info | ||
| 10303 | * is only valid for EXCEPTION_NMI exits. For EXTERNAL_INTERRUPT | ||
| 10304 | * we need to query the in-kernel LAPIC. | ||
| 10305 | */ | ||
| 10306 | WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT); | ||
| 10307 | if ((exit_intr_info & | ||
| 10308 | (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == | ||
| 10309 | (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) { | ||
| 10310 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 10311 | vmcs12->vm_exit_intr_error_code = | ||
| 10312 | vmcs_read32(VM_EXIT_INTR_ERROR_CODE); | ||
| 10313 | } | ||
| 10314 | |||
| 10315 | nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, | ||
| 10316 | vmcs_readl(EXIT_QUALIFICATION)); | ||
| 10317 | return 1; | ||
| 10318 | } | ||
| 10319 | |||
| 10320 | static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) | ||
| 10321 | { | ||
| 10322 | *info1 = vmcs_readl(EXIT_QUALIFICATION); | ||
| 10323 | *info2 = vmcs_read32(VM_EXIT_INTR_INFO); | ||
| 10324 | } | ||
| 10325 | |||
| 10326 | static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) | ||
| 10327 | { | ||
| 10328 | if (vmx->pml_pg) { | ||
| 10329 | __free_page(vmx->pml_pg); | ||
| 10330 | vmx->pml_pg = NULL; | ||
| 10331 | } | ||
| 10332 | } | ||
| 10333 | |||
| 10334 | static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) | ||
| 10335 | { | ||
| 10336 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 10337 | u64 *pml_buf; | ||
| 10338 | u16 pml_idx; | ||
| 10339 | |||
| 10340 | pml_idx = vmcs_read16(GUEST_PML_INDEX); | ||
| 10341 | |||
| 10342 | /* Do nothing if PML buffer is empty */ | ||
| 10343 | if (pml_idx == (PML_ENTITY_NUM - 1)) | ||
| 10344 | return; | ||
| 10345 | |||
| 10346 | /* PML index always points to next available PML buffer entity */ | ||
| 10347 | if (pml_idx >= PML_ENTITY_NUM) | ||
| 10348 | pml_idx = 0; | ||
| 10349 | else | ||
| 10350 | pml_idx++; | ||
| 10351 | |||
| 10352 | pml_buf = page_address(vmx->pml_pg); | ||
| 10353 | for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { | ||
| 10354 | u64 gpa; | ||
| 10355 | |||
| 10356 | gpa = pml_buf[pml_idx]; | ||
| 10357 | WARN_ON(gpa & (PAGE_SIZE - 1)); | ||
| 10358 | kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); | ||
| 10359 | } | ||
| 10360 | |||
| 10361 | /* reset PML index */ | ||
| 10362 | vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); | ||
| 10363 | } | ||
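Editor's note: the index handling above reflects how the CPU fills the PML buffer: GUEST_PML_INDEX starts at PML_ENTITY_NUM - 1 (511) and is decremented as entries are logged, so 511 means the buffer is empty, and otherwise the valid entries occupy slots index+1 through 511 (or all 512 slots once the index has wrapped below zero). A sketch of the same bookkeeping, illustrative only and assuming PML_ENTITY_NUM == 512:

#include <stdint.h>

#define PML_NUM_ENTRIES	512	/* assumption: matches PML_ENTITY_NUM */

/*
 * Return the index of the first valid (logged) entry in the PML buffer,
 * or PML_NUM_ENTRIES if the buffer is empty.  The hardware index names
 * the next free slot and counts down from PML_NUM_ENTRIES - 1, wrapping
 * below zero once the buffer is completely full.
 */
static unsigned int pml_first_valid_entry(uint16_t guest_pml_index)
{
	if (guest_pml_index == PML_NUM_ENTRIES - 1)
		return PML_NUM_ENTRIES;		/* nothing logged */
	if (guest_pml_index >= PML_NUM_ENTRIES)
		return 0;			/* buffer full: all entries valid */
	return guest_pml_index + 1;		/* entries index+1 .. 511 are valid */
}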
| 10364 | |||
| 10365 | /* | ||
| 10366 | * Flush all vcpus' PML buffers and record the logged GPAs in dirty_bitmap. | ||
| 10367 | * Called before reporting dirty_bitmap to userspace. | ||
| 10368 | */ | ||
| 10369 | static void kvm_flush_pml_buffers(struct kvm *kvm) | ||
| 10370 | { | ||
| 10371 | int i; | ||
| 10372 | struct kvm_vcpu *vcpu; | ||
| 10373 | /* | ||
| 10374 | * We only need to kick the vcpus out of guest mode here, as the PML | ||
| 10375 | * buffer is flushed at the beginning of every VMEXIT, so only vcpus | ||
| 10376 | * currently running in guest mode can have unflushed GPAs in their | ||
| 10377 | * PML buffers. | ||
| 10378 | */ | ||
| 10379 | kvm_for_each_vcpu(i, vcpu, kvm) | ||
| 10380 | kvm_vcpu_kick(vcpu); | ||
| 10381 | } | ||
| 10382 | |||
| 10383 | static void vmx_dump_sel(char *name, uint32_t sel) | ||
| 10384 | { | ||
| 10385 | pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", | ||
| 10386 | name, vmcs_read16(sel), | ||
| 10387 | vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), | ||
| 10388 | vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), | ||
| 10389 | vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); | ||
| 10390 | } | ||
| 10391 | |||
| 10392 | static void vmx_dump_dtsel(char *name, uint32_t limit) | ||
| 10393 | { | ||
| 10394 | pr_err("%s limit=0x%08x, base=0x%016lx\n", | ||
| 10395 | name, vmcs_read32(limit), | ||
| 10396 | vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); | ||
| 10397 | } | ||
| 10398 | |||
| 10399 | static void dump_vmcs(void) | ||
| 10400 | { | ||
| 10401 | u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); | ||
| 10402 | u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); | ||
| 10403 | u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | ||
| 10404 | u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); | ||
| 10405 | u32 secondary_exec_control = 0; | ||
| 10406 | unsigned long cr4 = vmcs_readl(GUEST_CR4); | ||
| 10407 | u64 efer = vmcs_read64(GUEST_IA32_EFER); | ||
| 10408 | int i, n; | ||
| 10409 | |||
| 10410 | if (cpu_has_secondary_exec_ctrls()) | ||
| 10411 | secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); | ||
| 10412 | |||
| 10413 | pr_err("*** Guest State ***\n"); | ||
| 10414 | pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", | ||
| 10415 | vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), | ||
| 10416 | vmcs_readl(CR0_GUEST_HOST_MASK)); | ||
| 10417 | pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", | ||
| 10418 | cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); | ||
| 10419 | pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); | ||
| 10420 | if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) && | ||
| 10421 | (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA)) | ||
| 10422 | { | ||
| 10423 | pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", | ||
| 10424 | vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); | ||
| 10425 | pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n", | ||
| 10426 | vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3)); | ||
| 10427 | } | ||
| 10428 | pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", | ||
| 10429 | vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); | ||
| 10430 | pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", | ||
| 10431 | vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); | ||
| 10432 | pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", | ||
| 10433 | vmcs_readl(GUEST_SYSENTER_ESP), | ||
| 10434 | vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); | ||
| 10435 | vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); | ||
| 10436 | vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); | ||
| 10437 | vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); | ||
| 10438 | vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); | ||
| 10439 | vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); | ||
| 10440 | vmx_dump_sel("GS: ", GUEST_GS_SELECTOR); | ||
| 10441 | vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); | ||
| 10442 | vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); | ||
| 10443 | vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); | ||
| 10444 | vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); | ||
| 10445 | if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) || | ||
| 10446 | (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER))) | ||
| 10447 | pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", | ||
| 10448 | efer, vmcs_read64(GUEST_IA32_PAT)); | ||
| 10449 | pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", | ||
| 10450 | vmcs_read64(GUEST_IA32_DEBUGCTL), | ||
| 10451 | vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); | ||
| 10452 | if (cpu_has_load_perf_global_ctrl && | ||
| 10453 | vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) | ||
| 10454 | pr_err("PerfGlobCtl = 0x%016llx\n", | ||
| 10455 | vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); | ||
| 10456 | if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) | ||
| 10457 | pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS)); | ||
| 10458 | pr_err("Interruptibility = %08x ActivityState = %08x\n", | ||
| 10459 | vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), | ||
| 10460 | vmcs_read32(GUEST_ACTIVITY_STATE)); | ||
| 10461 | if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) | ||
| 10462 | pr_err("InterruptStatus = %04x\n", | ||
| 10463 | vmcs_read16(GUEST_INTR_STATUS)); | ||
| 10464 | |||
| 10465 | pr_err("*** Host State ***\n"); | ||
| 10466 | pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", | ||
| 10467 | vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); | ||
| 10468 | pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", | ||
| 10469 | vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), | ||
| 10470 | vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), | ||
| 10471 | vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), | ||
| 10472 | vmcs_read16(HOST_TR_SELECTOR)); | ||
| 10473 | pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", | ||
| 10474 | vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), | ||
| 10475 | vmcs_readl(HOST_TR_BASE)); | ||
| 10476 | pr_err("GDTBase=%016lx IDTBase=%016lx\n", | ||
| 10477 | vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); | ||
| 10478 | pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n", | ||
| 10479 | vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), | ||
| 10480 | vmcs_readl(HOST_CR4)); | ||
| 10481 | pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", | ||
| 10482 | vmcs_readl(HOST_IA32_SYSENTER_ESP), | ||
| 10483 | vmcs_read32(HOST_IA32_SYSENTER_CS), | ||
| 10484 | vmcs_readl(HOST_IA32_SYSENTER_EIP)); | ||
| 10485 | if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER)) | ||
| 10486 | pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", | ||
| 10487 | vmcs_read64(HOST_IA32_EFER), | ||
| 10488 | vmcs_read64(HOST_IA32_PAT)); | ||
| 10489 | if (cpu_has_load_perf_global_ctrl && | ||
| 10490 | vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) | ||
| 10491 | pr_err("PerfGlobCtl = 0x%016llx\n", | ||
| 10492 | vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); | ||
| 10493 | |||
| 10494 | pr_err("*** Control State ***\n"); | ||
| 10495 | pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", | ||
| 10496 | pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control); | ||
| 10497 | pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); | ||
| 10498 | pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", | ||
| 10499 | vmcs_read32(EXCEPTION_BITMAP), | ||
| 10500 | vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), | ||
| 10501 | vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); | ||
| 10502 | pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", | ||
| 10503 | vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), | ||
| 10504 | vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), | ||
| 10505 | vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); | ||
| 10506 | pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", | ||
| 10507 | vmcs_read32(VM_EXIT_INTR_INFO), | ||
| 10508 | vmcs_read32(VM_EXIT_INTR_ERROR_CODE), | ||
| 10509 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); | ||
| 10510 | pr_err(" reason=%08x qualification=%016lx\n", | ||
| 10511 | vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); | ||
| 10512 | pr_err("IDTVectoring: info=%08x errcode=%08x\n", | ||
| 10513 | vmcs_read32(IDT_VECTORING_INFO_FIELD), | ||
| 10514 | vmcs_read32(IDT_VECTORING_ERROR_CODE)); | ||
| 10515 | pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); | ||
| 10516 | if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) | ||
| 10517 | pr_err("TSC Multiplier = 0x%016llx\n", | ||
| 10518 | vmcs_read64(TSC_MULTIPLIER)); | ||
| 10519 | if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) | ||
| 10520 | pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); | ||
| 10521 | if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) | ||
| 10522 | pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); | ||
| 10523 | if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) | ||
| 10524 | pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); | ||
| 10525 | n = vmcs_read32(CR3_TARGET_COUNT); | ||
| 10526 | for (i = 0; i + 1 < n; i += 4) | ||
| 10527 | pr_err("CR3 target%u=%016lx target%u=%016lx\n", | ||
| 10528 | i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2), | ||
| 10529 | i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2)); | ||
| 10530 | if (i < n) | ||
| 10531 | pr_err("CR3 target%u=%016lx\n", | ||
| 10532 | i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2)); | ||
| 10533 | if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) | ||
| 10534 | pr_err("PLE Gap=%08x Window=%08x\n", | ||
| 10535 | vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); | ||
| 10536 | if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) | ||
| 10537 | pr_err("Virtual processor ID = 0x%04x\n", | ||
| 10538 | vmcs_read16(VIRTUAL_PROCESSOR_ID)); | ||
| 10539 | } | ||
| 10540 | |||
| 10541 | /* | ||
| 10542 | * The guest has exited. See if we can fix it or if we need userspace | ||
| 10543 | * assistance. | ||
| 10544 | */ | ||
| 10545 | static int vmx_handle_exit(struct kvm_vcpu *vcpu) | ||
| 10546 | { | ||
| 10547 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 10548 | u32 exit_reason = vmx->exit_reason; | ||
| 10549 | u32 vectoring_info = vmx->idt_vectoring_info; | ||
| 10550 | |||
| 10551 | trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); | ||
| 10552 | |||
| 10553 | /* | ||
| 10554 | * Flush the logged GPAs out of the PML buffer so that dirty_bitmap is | ||
| 10555 | * up to date. A further benefit: in kvm_vm_ioctl_get_dirty_log, before | ||
| 10556 | * querying dirty_bitmap we only need to kick all vcpus out of guest | ||
| 10557 | * mode, because once a vcpu is back in root mode its PML buffer must | ||
| 10558 | * already have been flushed. | ||
| 10559 | */ | ||
| 10560 | if (enable_pml) | ||
| 10561 | vmx_flush_pml_buffer(vcpu); | ||
| 10562 | |||
| 10563 | /* If guest state is invalid, start emulating */ | ||
| 10564 | if (vmx->emulation_required) | ||
| 10565 | return handle_invalid_guest_state(vcpu); | ||
| 10566 | |||
| 10567 | if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason)) | ||
| 10568 | return nested_vmx_reflect_vmexit(vcpu, exit_reason); | ||
| 10569 | |||
| 10570 | if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { | ||
| 10571 | dump_vmcs(); | ||
| 10572 | vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; | ||
| 10573 | vcpu->run->fail_entry.hardware_entry_failure_reason | ||
| 10574 | = exit_reason; | ||
| 10575 | return 0; | ||
| 10576 | } | ||
| 10577 | |||
| 10578 | if (unlikely(vmx->fail)) { | ||
| 10579 | vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; | ||
| 10580 | vcpu->run->fail_entry.hardware_entry_failure_reason | ||
| 10581 | = vmcs_read32(VM_INSTRUCTION_ERROR); | ||
| 10582 | return 0; | ||
| 10583 | } | ||
| 10584 | |||
| 10585 | /* | ||
| 10586 | * Note: | ||
| 10587 | * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by a | ||
| 10588 | * delivery event, since that indicates the guest is accessing MMIO. | ||
| 10589 | * The vm-exit would be triggered again after returning to the guest, | ||
| 10590 | * causing an infinite loop. | ||
| 10591 | */ | ||
| 10592 | if ((vectoring_info & VECTORING_INFO_VALID_MASK) && | ||
| 10593 | (exit_reason != EXIT_REASON_EXCEPTION_NMI && | ||
| 10594 | exit_reason != EXIT_REASON_EPT_VIOLATION && | ||
| 10595 | exit_reason != EXIT_REASON_PML_FULL && | ||
| 10596 | exit_reason != EXIT_REASON_TASK_SWITCH)) { | ||
| 10597 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; | ||
| 10598 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; | ||
| 10599 | vcpu->run->internal.ndata = 3; | ||
| 10600 | vcpu->run->internal.data[0] = vectoring_info; | ||
| 10601 | vcpu->run->internal.data[1] = exit_reason; | ||
| 10602 | vcpu->run->internal.data[2] = vcpu->arch.exit_qualification; | ||
| 10603 | if (exit_reason == EXIT_REASON_EPT_MISCONFIG) { | ||
| 10604 | vcpu->run->internal.ndata++; | ||
| 10605 | vcpu->run->internal.data[3] = | ||
| 10606 | vmcs_read64(GUEST_PHYSICAL_ADDRESS); | ||
| 10607 | } | ||
| 10608 | return 0; | ||
| 10609 | } | ||
| 10610 | |||
| 10611 | if (unlikely(!enable_vnmi && | ||
| 10612 | vmx->loaded_vmcs->soft_vnmi_blocked)) { | ||
| 10613 | if (vmx_interrupt_allowed(vcpu)) { | ||
| 10614 | vmx->loaded_vmcs->soft_vnmi_blocked = 0; | ||
| 10615 | } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL && | ||
| 10616 | vcpu->arch.nmi_pending) { | ||
| 10617 | /* | ||
| 10618 | * This CPU gives us no help in finding the end of an | ||
| 10619 | * NMI-blocked window if the guest runs with IRQs | ||
| 10620 | * disabled. So we pull the trigger after 1 s of | ||
| 10621 | * futile waiting, but inform the user about this. | ||
| 10622 | */ | ||
| 10623 | printk(KERN_WARNING "%s: Breaking out of NMI-blocked " | ||
| 10624 | "state on VCPU %d after 1 s timeout\n", | ||
| 10625 | __func__, vcpu->vcpu_id); | ||
| 10626 | vmx->loaded_vmcs->soft_vnmi_blocked = 0; | ||
| 10627 | } | ||
| 10628 | } | ||
| 10629 | |||
| 10630 | if (exit_reason < kvm_vmx_max_exit_handlers | ||
| 10631 | && kvm_vmx_exit_handlers[exit_reason]) | ||
| 10632 | return kvm_vmx_exit_handlers[exit_reason](vcpu); | ||
| 10633 | else { | ||
| 10634 | vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", | ||
| 10635 | exit_reason); | ||
| 10636 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
| 10637 | return 1; | ||
| 10638 | } | ||
| 10639 | } | ||
| 10640 | |||
| 10641 | /* | ||
| 10642 | * Software based L1D cache flush which is used when microcode providing | ||
| 10643 | * the cache control MSR is not loaded. | ||
| 10644 | * | ||
| 10645 | * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but | ||
| 10646 | * flushing it requires reading 64 KiB because the replacement algorithm | ||
| 10647 | * is not exactly LRU. This could be sized at runtime via topology | ||
| 10648 | * information, but as all relevant affected CPUs have a 32 KiB L1D cache | ||
| 10649 | * there is no point in doing so. | ||
| 10650 | */ | ||
| 10651 | static void vmx_l1d_flush(struct kvm_vcpu *vcpu) | ||
| 10652 | { | ||
| 10653 | int size = PAGE_SIZE << L1D_CACHE_ORDER; | ||
| 10654 | |||
| 10655 | /* | ||
| 10656 | * This code is only executed when the flush mode is 'cond' or | ||
| 10657 | * 'always'. | ||
| 10658 | */ | ||
| 10659 | if (static_branch_likely(&vmx_l1d_flush_cond)) { | ||
| 10660 | bool flush_l1d; | ||
| 10661 | |||
| 10662 | /* | ||
| 10663 | * Clear the per-vcpu flush bit; it gets set again | ||
| 10664 | * either from vcpu_run() or from one of the unsafe | ||
| 10665 | * VMEXIT handlers. | ||
| 10666 | */ | ||
| 10667 | flush_l1d = vcpu->arch.l1tf_flush_l1d; | ||
| 10668 | vcpu->arch.l1tf_flush_l1d = false; | ||
| 10669 | |||
| 10670 | /* | ||
| 10671 | * Clear the per-cpu flush bit; it gets set again from | ||
| 10672 | * the interrupt handlers. | ||
| 10673 | */ | ||
| 10674 | flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); | ||
| 10675 | kvm_clear_cpu_l1tf_flush_l1d(); | ||
| 10676 | |||
| 10677 | if (!flush_l1d) | ||
| 10678 | return; | ||
| 10679 | } | ||
| 10680 | |||
| 10681 | vcpu->stat.l1d_flush++; | ||
| 10682 | |||
| 10683 | if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { | ||
| 10684 | wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); | ||
| 10685 | return; | ||
| 10686 | } | ||
| 10687 | |||
| 10688 | asm volatile( | ||
| 10689 | /* First ensure the pages are in the TLB */ | ||
| 10690 | "xorl %%eax, %%eax\n" | ||
| 10691 | ".Lpopulate_tlb:\n\t" | ||
| 10692 | "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" | ||
| 10693 | "addl $4096, %%eax\n\t" | ||
| 10694 | "cmpl %%eax, %[size]\n\t" | ||
| 10695 | "jne .Lpopulate_tlb\n\t" | ||
| 10696 | "xorl %%eax, %%eax\n\t" | ||
| 10697 | "cpuid\n\t" | ||
| 10698 | /* Now fill the cache */ | ||
| 10699 | "xorl %%eax, %%eax\n" | ||
| 10700 | ".Lfill_cache:\n" | ||
| 10701 | "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" | ||
| 10702 | "addl $64, %%eax\n\t" | ||
| 10703 | "cmpl %%eax, %[size]\n\t" | ||
| 10704 | "jne .Lfill_cache\n\t" | ||
| 10705 | "lfence\n" | ||
| 10706 | :: [flush_pages] "r" (vmx_l1d_flush_pages), | ||
| 10707 | [size] "r" (size) | ||
| 10708 | : "eax", "ebx", "ecx", "edx"); | ||
| 10709 | } | ||
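
For readers who do not want to parse the inline assembly above, the following is a rough user-space C sketch of the same access pattern: touch each 4 KiB page of the 64 KiB buffer once so its translation is in the TLB, then read every 64-byte line to displace the 32 KiB L1D. It is only an illustration of the geometry; the serializing CPUID between the loops and the trailing LFENCE are omitted, and the buffer allocation is invented for the example rather than taken from the kernel.

    #include <stdlib.h>

    #define FLUSH_SIZE (64 * 1024)  /* read 64 KiB to displace a 32 KiB L1D */

    static void l1d_flush_sw(const unsigned char *flush_pages)
    {
        volatile unsigned char sink = 0;
        size_t offset;

        /* First touch each 4 KiB page so its translation is in the TLB. */
        for (offset = 0; offset < FLUSH_SIZE; offset += 4096)
            sink = flush_pages[offset];

        /* Then read every 64-byte line to push existing data out of L1D. */
        for (offset = 0; offset < FLUSH_SIZE; offset += 64)
            sink = flush_pages[offset];

        (void)sink;
    }

    int main(void)
    {
        unsigned char *buf = calloc(1, FLUSH_SIZE);

        if (!buf)
            return 1;
        l1d_flush_sw(buf);
        free(buf);
        return 0;
    }
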
| 10710 | |||
| 10711 | static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) | ||
| 10712 | { | ||
| 10713 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 10714 | |||
| 10715 | if (is_guest_mode(vcpu) && | ||
| 10716 | nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) | ||
| 10717 | return; | ||
| 10718 | |||
| 10719 | if (irr == -1 || tpr < irr) { | ||
| 10720 | vmcs_write32(TPR_THRESHOLD, 0); | ||
| 10721 | return; | ||
| 10722 | } | ||
| 10723 | |||
| 10724 | vmcs_write32(TPR_THRESHOLD, irr); | ||
| 10725 | } | ||
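
The threshold selection above can be illustrated with a small stand-alone sketch; the helper name and the sample priorities are invented for the example and are not kernel APIs. The idea is to trap TPR updates only while a pending interrupt is blocked by the current task priority, so the exit happens exactly when the guest lowers its TPR far enough for the interrupt to become deliverable.

    #include <stdio.h>

    /* Pick the TPR threshold the way update_cr8_intercept() does above. */
    static int pick_tpr_threshold(int tpr, int irr)
    {
        if (irr == -1 || tpr < irr)
            return 0;   /* nothing is blocked: no need to trap TPR drops */
        return irr;     /* exit once the TPR drops below the pending priority */
    }

    int main(void)
    {
        printf("tpr=8, irr=5  -> threshold %d\n", pick_tpr_threshold(8, 5));
        printf("tpr=3, irr=5  -> threshold %d\n", pick_tpr_threshold(3, 5));
        printf("tpr=8, irr=-1 -> threshold %d\n", pick_tpr_threshold(8, -1));
        return 0;
    }
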
| 10726 | |||
| 10727 | static void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) | ||
| 10728 | { | ||
| 10729 | u32 sec_exec_control; | ||
| 10730 | |||
| 10731 | if (!lapic_in_kernel(vcpu)) | ||
| 10732 | return; | ||
| 10733 | |||
| 10734 | if (!flexpriority_enabled && | ||
| 10735 | !cpu_has_vmx_virtualize_x2apic_mode()) | ||
| 10736 | return; | ||
| 10737 | |||
| 10738 | /* Postpone execution until vmcs01 is the current VMCS. */ | ||
| 10739 | if (is_guest_mode(vcpu)) { | ||
| 10740 | to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true; | ||
| 10741 | return; | ||
| 10742 | } | ||
| 10743 | |||
| 10744 | sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); | ||
| 10745 | sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | | ||
| 10746 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); | ||
| 10747 | |||
| 10748 | switch (kvm_get_apic_mode(vcpu)) { | ||
| 10749 | case LAPIC_MODE_INVALID: | ||
| 10750 | WARN_ONCE(true, "Invalid local APIC state"); | ||
| 10751 | case LAPIC_MODE_DISABLED: | ||
| 10752 | break; | ||
| 10753 | case LAPIC_MODE_XAPIC: | ||
| 10754 | if (flexpriority_enabled) { | ||
| 10755 | sec_exec_control |= | ||
| 10756 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
| 10757 | vmx_flush_tlb(vcpu, true); | ||
| 10758 | } | ||
| 10759 | break; | ||
| 10760 | case LAPIC_MODE_X2APIC: | ||
| 10761 | if (cpu_has_vmx_virtualize_x2apic_mode()) | ||
| 10762 | sec_exec_control |= | ||
| 10763 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; | ||
| 10764 | break; | ||
| 10765 | } | ||
| 10766 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); | ||
| 10767 | |||
| 10768 | vmx_update_msr_bitmap(vcpu); | ||
| 10769 | } | ||
| 10770 | |||
| 10771 | static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) | ||
| 10772 | { | ||
| 10773 | if (!is_guest_mode(vcpu)) { | ||
| 10774 | vmcs_write64(APIC_ACCESS_ADDR, hpa); | ||
| 10775 | vmx_flush_tlb(vcpu, true); | ||
| 10776 | } | ||
| 10777 | } | ||
| 10778 | |||
| 10779 | static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) | ||
| 10780 | { | ||
| 10781 | u16 status; | ||
| 10782 | u8 old; | ||
| 10783 | |||
| 10784 | if (max_isr == -1) | ||
| 10785 | max_isr = 0; | ||
| 10786 | |||
| 10787 | status = vmcs_read16(GUEST_INTR_STATUS); | ||
| 10788 | old = status >> 8; | ||
| 10789 | if (max_isr != old) { | ||
| 10790 | status &= 0xff; | ||
| 10791 | status |= max_isr << 8; | ||
| 10792 | vmcs_write16(GUEST_INTR_STATUS, status); | ||
| 10793 | } | ||
| 10794 | } | ||
| 10795 | |||
| 10796 | static void vmx_set_rvi(int vector) | ||
| 10797 | { | ||
| 10798 | u16 status; | ||
| 10799 | u8 old; | ||
| 10800 | |||
| 10801 | if (vector == -1) | ||
| 10802 | vector = 0; | ||
| 10803 | |||
| 10804 | status = vmcs_read16(GUEST_INTR_STATUS); | ||
| 10805 | old = (u8)status & 0xff; | ||
| 10806 | if ((u8)vector != old) { | ||
| 10807 | status &= ~0xff; | ||
| 10808 | status |= (u8)vector; | ||
| 10809 | vmcs_write16(GUEST_INTR_STATUS, status); | ||
| 10810 | } | ||
| 10811 | } | ||
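
vmx_hwapic_isr_update() and vmx_set_rvi() edit the two halves of the same 16-bit guest interrupt status field: the low byte holds RVI (the highest requesting vector) and the high byte holds SVI (the highest in-service vector). A tiny stand-alone sketch of that packing, with helper names invented for the example, shows the masking:

    #include <stdint.h>
    #include <stdio.h>

    static uint16_t set_rvi(uint16_t status, uint8_t vector)
    {
        status &= ~0xff;            /* clear the old RVI */
        status |= vector;           /* install the new requesting vector */
        return status;
    }

    static uint16_t set_svi(uint16_t status, uint8_t max_isr)
    {
        status &= 0xff;             /* keep RVI, clear the old SVI */
        status |= (uint16_t)max_isr << 8;
        return status;
    }

    int main(void)
    {
        uint16_t status = 0;

        status = set_rvi(status, 0x31);   /* vector 0x31 is pending */
        status = set_svi(status, 0x20);   /* vector 0x20 is in service */
        printf("GUEST_INTR_STATUS = 0x%04x\n", (unsigned)status);  /* 0x2031 */
        return 0;
    }
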
| 10812 | |||
| 10813 | static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) | ||
| 10814 | { | ||
| 10815 | /* | ||
| 10816 | * When running L2, updating RVI is only relevant if | ||
| 10817 | * vmcs12 has virtual-interrupt-delivery enabled. | ||
| 10818 | * However, that can be enabled only when L1 also | ||
| 10819 | * intercepts external interrupts, in which case | ||
| 10820 | * we should not update vmcs02's RVI but intercept the | ||
| 10821 | * interrupt instead. Therefore, do nothing when running L2. | ||
| 10822 | */ | ||
| 10823 | if (!is_guest_mode(vcpu)) | ||
| 10824 | vmx_set_rvi(max_irr); | ||
| 10825 | } | ||
| 10826 | |||
| 10827 | static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) | ||
| 10828 | { | ||
| 10829 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 10830 | int max_irr; | ||
| 10831 | bool max_irr_updated; | ||
| 10832 | |||
| 10833 | WARN_ON(!vcpu->arch.apicv_active); | ||
| 10834 | if (pi_test_on(&vmx->pi_desc)) { | ||
| 10835 | pi_clear_on(&vmx->pi_desc); | ||
| 10836 | /* | ||
| 10837 | * IOMMU can write to PIR.ON, so the barrier matters even on UP. | ||
| 10838 | * But on x86 this is just a compiler barrier anyway. | ||
| 10839 | */ | ||
| 10840 | smp_mb__after_atomic(); | ||
| 10841 | max_irr_updated = | ||
| 10842 | kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); | ||
| 10843 | |||
| 10844 | /* | ||
| 10845 | * If we are running L2 and L1 has a new pending interrupt | ||
| 10846 | * which can be injected, we should re-evaluate | ||
| 10847 | * what should be done with this new L1 interrupt. | ||
| 10848 | * If L1 intercepts external interrupts, we should | ||
| 10849 | * exit from L2 to L1. Otherwise, the interrupt should be | ||
| 10850 | * delivered directly to L2. | ||
| 10851 | */ | ||
| 10852 | if (is_guest_mode(vcpu) && max_irr_updated) { | ||
| 10853 | if (nested_exit_on_intr(vcpu)) | ||
| 10854 | kvm_vcpu_exiting_guest_mode(vcpu); | ||
| 10855 | else | ||
| 10856 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
| 10857 | } | ||
| 10858 | } else { | ||
| 10859 | max_irr = kvm_lapic_find_highest_irr(vcpu); | ||
| 10860 | } | ||
| 10861 | vmx_hwapic_irr_update(vcpu, max_irr); | ||
| 10862 | return max_irr; | ||
| 10863 | } | ||
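
The heavy lifting above is done by kvm_apic_update_irr(), which folds the 256-bit posted-interrupt request bitmap (PIR) into the APIC IRR and reports the highest pending vector. The following user-space sketch of that merge is simplified and uses flat arrays and a helper name invented for the example, not the kernel's APIC data structures.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* OR the 256-bit PIR into the IRR, report whether anything new arrived
     * and which pending vector is the highest.
     */
    static bool merge_pir_into_irr(uint64_t irr[4], const uint64_t pir[4],
                                   int *max_irr)
    {
        bool updated = false;
        int word, bit;

        *max_irr = -1;
        for (word = 3; word >= 0; word--) {
            if (pir[word] & ~irr[word])
                updated = true;
            irr[word] |= pir[word];
            if (*max_irr < 0 && irr[word])
                for (bit = 63; bit >= 0; bit--)
                    if (irr[word] & (1ULL << bit)) {
                        *max_irr = word * 64 + bit;
                        break;
                    }
        }
        return updated;
    }

    int main(void)
    {
        uint64_t irr[4] = { 0 }, pir[4] = { 0 };
        int max_irr;

        pir[0] = 1ULL << 0x31;          /* vector 0x31 was posted */
        merge_pir_into_irr(irr, pir, &max_irr);
        printf("highest pending vector = 0x%x\n", (unsigned)max_irr);  /* 0x31 */
        return 0;
    }
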
| 10864 | |||
| 10865 | static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) | ||
| 10866 | { | ||
| 10867 | u8 rvi = vmx_get_rvi(); | ||
| 10868 | u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); | ||
| 10869 | |||
| 10870 | return ((rvi & 0xf0) > (vppr & 0xf0)); | ||
| 10871 | } | ||
| 10872 | |||
| 10873 | static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) | ||
| 10874 | { | ||
| 10875 | if (!kvm_vcpu_apicv_active(vcpu)) | ||
| 10876 | return; | ||
| 10877 | |||
| 10878 | vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); | ||
| 10879 | vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); | ||
| 10880 | vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); | ||
| 10881 | vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); | ||
| 10882 | } | ||
| 10883 | |||
| 10884 | static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) | ||
| 10885 | { | ||
| 10886 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 10887 | |||
| 10888 | pi_clear_on(&vmx->pi_desc); | ||
| 10889 | memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); | ||
| 10890 | } | ||
| 10891 | |||
| 10892 | static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) | ||
| 10893 | { | ||
| 10894 | u32 exit_intr_info = 0; | ||
| 10895 | u16 basic_exit_reason = (u16)vmx->exit_reason; | ||
| 10896 | |||
| 10897 | if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY | ||
| 10898 | || basic_exit_reason == EXIT_REASON_EXCEPTION_NMI)) | ||
| 10899 | return; | ||
| 10900 | |||
| 10901 | if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) | ||
| 10902 | exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | ||
| 10903 | vmx->exit_intr_info = exit_intr_info; | ||
| 10904 | |||
| 10905 | /* If the exit was due to a #PF, check for async PF */ | ||
| 10906 | if (is_page_fault(exit_intr_info)) | ||
| 10907 | vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); | ||
| 10908 | |||
| 10909 | /* Handle machine checks before interrupts are enabled */ | ||
| 10910 | if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY || | ||
| 10911 | is_machine_check(exit_intr_info)) | ||
| 10912 | kvm_machine_check(); | ||
| 10913 | |||
| 10914 | /* We need to handle NMIs before interrupts are enabled */ | ||
| 10915 | if (is_nmi(exit_intr_info)) { | ||
| 10916 | kvm_before_interrupt(&vmx->vcpu); | ||
| 10917 | asm("int $2"); | ||
| 10918 | kvm_after_interrupt(&vmx->vcpu); | ||
| 10919 | } | ||
| 10920 | } | ||
| 10921 | |||
| 10922 | static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) | ||
| 10923 | { | ||
| 10924 | u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | ||
| 10925 | |||
| 10926 | if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) | ||
| 10927 | == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { | ||
| 10928 | unsigned int vector; | ||
| 10929 | unsigned long entry; | ||
| 10930 | gate_desc *desc; | ||
| 10931 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 10932 | #ifdef CONFIG_X86_64 | ||
| 10933 | unsigned long tmp; | ||
| 10934 | #endif | ||
| 10935 | |||
| 10936 | vector = exit_intr_info & INTR_INFO_VECTOR_MASK; | ||
| 10937 | desc = (gate_desc *)vmx->host_idt_base + vector; | ||
| 10938 | entry = gate_offset(desc); | ||
| 10939 | asm volatile( | ||
| 10940 | #ifdef CONFIG_X86_64 | ||
| 10941 | "mov %%" _ASM_SP ", %[sp]\n\t" | ||
| 10942 | "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t" | ||
| 10943 | "push $%c[ss]\n\t" | ||
| 10944 | "push %[sp]\n\t" | ||
| 10945 | #endif | ||
| 10946 | "pushf\n\t" | ||
| 10947 | __ASM_SIZE(push) " $%c[cs]\n\t" | ||
| 10948 | CALL_NOSPEC | ||
| 10949 | : | ||
| 10950 | #ifdef CONFIG_X86_64 | ||
| 10951 | [sp]"=&r"(tmp), | ||
| 10952 | #endif | ||
| 10953 | ASM_CALL_CONSTRAINT | ||
| 10954 | : | ||
| 10955 | THUNK_TARGET(entry), | ||
| 10956 | [ss]"i"(__KERNEL_DS), | ||
| 10957 | [cs]"i"(__KERNEL_CS) | ||
| 10958 | ); | ||
| 10959 | } | ||
| 10960 | } | ||
| 10961 | STACK_FRAME_NON_STANDARD(vmx_handle_external_intr); | ||
| 10962 | |||
| 10963 | static bool vmx_has_emulated_msr(int index) | ||
| 10964 | { | ||
| 10965 | switch (index) { | ||
| 10966 | case MSR_IA32_SMBASE: | ||
| 10967 | /* | ||
| 10968 | * We cannot do SMM unless we can run the guest in big | ||
| 10969 | * real mode. | ||
| 10970 | */ | ||
| 10971 | return enable_unrestricted_guest || emulate_invalid_guest_state; | ||
| 10972 | case MSR_AMD64_VIRT_SPEC_CTRL: | ||
| 10973 | /* This is AMD only. */ | ||
| 10974 | return false; | ||
| 10975 | default: | ||
| 10976 | return true; | ||
| 10977 | } | ||
| 10978 | } | ||
| 10979 | |||
| 10980 | static bool vmx_mpx_supported(void) | ||
| 10981 | { | ||
| 10982 | return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && | ||
| 10983 | (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); | ||
| 10984 | } | ||
| 10985 | |||
| 10986 | static bool vmx_xsaves_supported(void) | ||
| 10987 | { | ||
| 10988 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 10989 | SECONDARY_EXEC_XSAVES; | ||
| 10990 | } | ||
| 10991 | |||
| 10992 | static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) | ||
| 10993 | { | ||
| 10994 | u32 exit_intr_info; | ||
| 10995 | bool unblock_nmi; | ||
| 10996 | u8 vector; | ||
| 10997 | bool idtv_info_valid; | ||
| 10998 | |||
| 10999 | idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; | ||
| 11000 | |||
| 11001 | if (enable_vnmi) { | ||
| 11002 | if (vmx->loaded_vmcs->nmi_known_unmasked) | ||
| 11003 | return; | ||
| 11004 | /* | ||
| 11005 | * Can't use vmx->exit_intr_info since we're not sure what | ||
| 11006 | * the exit reason is. | ||
| 11007 | */ | ||
| 11008 | exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | ||
| 11009 | unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; | ||
| 11010 | vector = exit_intr_info & INTR_INFO_VECTOR_MASK; | ||
| 11011 | /* | ||
| 11012 | * SDM 3: 27.7.1.2 (September 2008) | ||
| 11013 | * Re-set bit "block by NMI" before VM entry if vmexit caused by | ||
| 11014 | * a guest IRET fault. | ||
| 11015 | * SDM 3: 23.2.2 (September 2008) | ||
| 11016 | * Bit 12 is undefined in any of the following cases: | ||
| 11017 | * If the VM exit sets the valid bit in the IDT-vectoring | ||
| 11018 | * information field. | ||
| 11019 | * If the VM exit is due to a double fault. | ||
| 11020 | */ | ||
| 11021 | if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && | ||
| 11022 | vector != DF_VECTOR && !idtv_info_valid) | ||
| 11023 | vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, | ||
| 11024 | GUEST_INTR_STATE_NMI); | ||
| 11025 | else | ||
| 11026 | vmx->loaded_vmcs->nmi_known_unmasked = | ||
| 11027 | !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) | ||
| 11028 | & GUEST_INTR_STATE_NMI); | ||
| 11029 | } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked)) | ||
| 11030 | vmx->loaded_vmcs->vnmi_blocked_time += | ||
| 11031 | ktime_to_ns(ktime_sub(ktime_get(), | ||
| 11032 | vmx->loaded_vmcs->entry_time)); | ||
| 11033 | } | ||
| 11034 | |||
| 11035 | static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, | ||
| 11036 | u32 idt_vectoring_info, | ||
| 11037 | int instr_len_field, | ||
| 11038 | int error_code_field) | ||
| 11039 | { | ||
| 11040 | u8 vector; | ||
| 11041 | int type; | ||
| 11042 | bool idtv_info_valid; | ||
| 11043 | |||
| 11044 | idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; | ||
| 11045 | |||
| 11046 | vcpu->arch.nmi_injected = false; | ||
| 11047 | kvm_clear_exception_queue(vcpu); | ||
| 11048 | kvm_clear_interrupt_queue(vcpu); | ||
| 11049 | |||
| 11050 | if (!idtv_info_valid) | ||
| 11051 | return; | ||
| 11052 | |||
| 11053 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
| 11054 | |||
| 11055 | vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; | ||
| 11056 | type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; | ||
| 11057 | |||
| 11058 | switch (type) { | ||
| 11059 | case INTR_TYPE_NMI_INTR: | ||
| 11060 | vcpu->arch.nmi_injected = true; | ||
| 11061 | /* | ||
| 11062 | * SDM 3: 27.7.1.2 (September 2008) | ||
| 11063 | * Clear bit "block by NMI" before VM entry if an NMI | ||
| 11064 | * delivery faulted. | ||
| 11065 | */ | ||
| 11066 | vmx_set_nmi_mask(vcpu, false); | ||
| 11067 | break; | ||
| 11068 | case INTR_TYPE_SOFT_EXCEPTION: | ||
| 11069 | vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); | ||
| 11070 | /* fall through */ | ||
| 11071 | case INTR_TYPE_HARD_EXCEPTION: | ||
| 11072 | if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { | ||
| 11073 | u32 err = vmcs_read32(error_code_field); | ||
| 11074 | kvm_requeue_exception_e(vcpu, vector, err); | ||
| 11075 | } else | ||
| 11076 | kvm_requeue_exception(vcpu, vector); | ||
| 11077 | break; | ||
| 11078 | case INTR_TYPE_SOFT_INTR: | ||
| 11079 | vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); | ||
| 11080 | /* fall through */ | ||
| 11081 | case INTR_TYPE_EXT_INTR: | ||
| 11082 | kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); | ||
| 11083 | break; | ||
| 11084 | default: | ||
| 11085 | break; | ||
| 11086 | } | ||
| 11087 | } | ||
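
The switch above relies on the layout of the IDT-vectoring (and VM-entry interruption) information field: the vector in bits 7:0, the event type in bits 10:8, a "deliver error code" flag in bit 11 and a valid bit in bit 31. A small stand-alone decoder illustrates it; the masks below are defined locally for the example rather than taken from the kernel's VECTORING_INFO_* constants.

    #include <stdint.h>
    #include <stdio.h>

    #define VECTORING_VECTOR_MASK   0x000000ffu
    #define VECTORING_TYPE_SHIFT    8
    #define VECTORING_TYPE_MASK     0x00000700u
    #define VECTORING_DELIVER_CODE  0x00000800u
    #define VECTORING_VALID         0x80000000u

    int main(void)
    {
        /* Example: a hardware exception (type 3), vector 14 (#PF), with an
         * error code, whose delivery was interrupted by a VM exit.
         */
        uint32_t info = VECTORING_VALID | VECTORING_DELIVER_CODE |
                        (3u << VECTORING_TYPE_SHIFT) | 14u;

        if (info & VECTORING_VALID)
            printf("vector %u, type %u, error code %s\n",
                   (unsigned)(info & VECTORING_VECTOR_MASK),
                   (unsigned)((info & VECTORING_TYPE_MASK) >> VECTORING_TYPE_SHIFT),
                   (info & VECTORING_DELIVER_CODE) ? "present" : "absent");
        return 0;
    }
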
| 11088 | |||
| 11089 | static void vmx_complete_interrupts(struct vcpu_vmx *vmx) | ||
| 11090 | { | ||
| 11091 | __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, | ||
| 11092 | VM_EXIT_INSTRUCTION_LEN, | ||
| 11093 | IDT_VECTORING_ERROR_CODE); | ||
| 11094 | } | ||
| 11095 | |||
| 11096 | static void vmx_cancel_injection(struct kvm_vcpu *vcpu) | ||
| 11097 | { | ||
| 11098 | __vmx_complete_interrupts(vcpu, | ||
| 11099 | vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), | ||
| 11100 | VM_ENTRY_INSTRUCTION_LEN, | ||
| 11101 | VM_ENTRY_EXCEPTION_ERROR_CODE); | ||
| 11102 | |||
| 11103 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); | ||
| 11104 | } | ||
| 11105 | |||
| 11106 | static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) | ||
| 11107 | { | ||
| 11108 | int i, nr_msrs; | ||
| 11109 | struct perf_guest_switch_msr *msrs; | ||
| 11110 | |||
| 11111 | msrs = perf_guest_get_msrs(&nr_msrs); | ||
| 11112 | |||
| 11113 | if (!msrs) | ||
| 11114 | return; | ||
| 11115 | |||
| 11116 | for (i = 0; i < nr_msrs; i++) | ||
| 11117 | if (msrs[i].host == msrs[i].guest) | ||
| 11118 | clear_atomic_switch_msr(vmx, msrs[i].msr); | ||
| 11119 | else | ||
| 11120 | add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, | ||
| 11121 | msrs[i].host, false); | ||
| 11122 | } | ||
| 11123 | |||
| 11124 | static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val) | ||
| 11125 | { | ||
| 11126 | vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val); | ||
| 11127 | if (!vmx->loaded_vmcs->hv_timer_armed) | ||
| 11128 | vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, | ||
| 11129 | PIN_BASED_VMX_PREEMPTION_TIMER); | ||
| 11130 | vmx->loaded_vmcs->hv_timer_armed = true; | ||
| 11131 | } | ||
| 11132 | |||
| 11133 | static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) | ||
| 11134 | { | ||
| 11135 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 11136 | u64 tscl; | ||
| 11137 | u32 delta_tsc; | ||
| 11138 | |||
| 11139 | if (vmx->req_immediate_exit) { | ||
| 11140 | vmx_arm_hv_timer(vmx, 0); | ||
| 11141 | return; | ||
| 11142 | } | ||
| 11143 | |||
| 11144 | if (vmx->hv_deadline_tsc != -1) { | ||
| 11145 | tscl = rdtsc(); | ||
| 11146 | if (vmx->hv_deadline_tsc > tscl) | ||
| 11147 | /* set_hv_timer ensures the delta fits in 32-bits */ | ||
| 11148 | delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> | ||
| 11149 | cpu_preemption_timer_multi); | ||
| 11150 | else | ||
| 11151 | delta_tsc = 0; | ||
| 11152 | |||
| 11153 | vmx_arm_hv_timer(vmx, delta_tsc); | ||
| 11154 | return; | ||
| 11155 | } | ||
| 11156 | |||
| 11157 | if (vmx->loaded_vmcs->hv_timer_armed) | ||
| 11158 | vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, | ||
| 11159 | PIN_BASED_VMX_PREEMPTION_TIMER); | ||
| 11160 | vmx->loaded_vmcs->hv_timer_armed = false; | ||
| 11161 | } | ||
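
The conversion above from an absolute TSC deadline to a preemption-timer value is just a subtraction and a right shift by the timer's rate exponent (cpu_preemption_timer_multi in the code above). A minimal sketch of the arithmetic, with numbers made up for the example:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t deadline_tsc = 5000000;  /* absolute TSC deadline */
        uint64_t now_tsc      = 1000000;  /* current TSC value */
        unsigned int rate_shift = 5;      /* one timer tick per 2^5 TSC cycles */
        uint32_t timer_value = 0;

        if (deadline_tsc > now_tsc)
            timer_value = (uint32_t)((deadline_tsc - now_tsc) >> rate_shift);

        printf("program the preemption timer to %u ticks\n",
               (unsigned)timer_value);
        return 0;
    }
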
| 11162 | |||
| 11163 | static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | ||
| 11164 | { | ||
| 11165 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 11166 | unsigned long cr3, cr4, evmcs_rsp; | ||
| 11167 | |||
| 11168 | /* Record the guest's net vcpu time for enforced NMI injections. */ | ||
| 11169 | if (unlikely(!enable_vnmi && | ||
| 11170 | vmx->loaded_vmcs->soft_vnmi_blocked)) | ||
| 11171 | vmx->loaded_vmcs->entry_time = ktime_get(); | ||
| 11172 | |||
| 11173 | /* Don't enter VMX if guest state is invalid; let the exit handler | ||
| 11174 | start emulating until we reach a valid state. */ | ||
| 11175 | if (vmx->emulation_required) | ||
| 11176 | return; | ||
| 11177 | |||
| 11178 | if (vmx->ple_window_dirty) { | ||
| 11179 | vmx->ple_window_dirty = false; | ||
| 11180 | vmcs_write32(PLE_WINDOW, vmx->ple_window); | ||
| 11181 | } | ||
| 11182 | |||
| 11183 | if (vmx->nested.need_vmcs12_sync) { | ||
| 11184 | /* | ||
| 11185 | * hv_evmcs may end up not being mapped after migration (when | ||
| 11186 | * L2 was running); map it here to make sure vmcs12 changes are | ||
| 11187 | * properly reflected. | ||
| 11188 | */ | ||
| 11189 | if (vmx->nested.enlightened_vmcs_enabled && | ||
| 11190 | !vmx->nested.hv_evmcs) | ||
| 11191 | nested_vmx_handle_enlightened_vmptrld(vcpu, false); | ||
| 11192 | |||
| 11193 | if (vmx->nested.hv_evmcs) { | ||
| 11194 | copy_vmcs12_to_enlightened(vmx); | ||
| 11195 | /* All fields are clean */ | ||
| 11196 | vmx->nested.hv_evmcs->hv_clean_fields |= | ||
| 11197 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; | ||
| 11198 | } else { | ||
| 11199 | copy_vmcs12_to_shadow(vmx); | ||
| 11200 | } | ||
| 11201 | vmx->nested.need_vmcs12_sync = false; | ||
| 11202 | } | ||
| 11203 | |||
| 11204 | if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) | ||
| 11205 | vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); | ||
| 11206 | if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) | ||
| 11207 | vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); | ||
| 11208 | |||
| 11209 | cr3 = __get_current_cr3_fast(); | ||
| 11210 | if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { | ||
| 11211 | vmcs_writel(HOST_CR3, cr3); | ||
| 11212 | vmx->loaded_vmcs->host_state.cr3 = cr3; | ||
| 11213 | } | ||
| 11214 | |||
| 11215 | cr4 = cr4_read_shadow(); | ||
| 11216 | if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { | ||
| 11217 | vmcs_writel(HOST_CR4, cr4); | ||
| 11218 | vmx->loaded_vmcs->host_state.cr4 = cr4; | ||
| 11219 | } | ||
| 11220 | |||
| 11221 | /* When single-stepping over STI and MOV SS, we must clear the | ||
| 11222 | * corresponding interruptibility bits in the guest state. Otherwise | ||
| 11223 | * vmentry fails as it then expects bit 14 (BS) of the pending debug | ||
| 11224 | * exceptions field to be set, but that's not correct for the guest | ||
| 11225 | * debugging case. */ | ||
| 11226 | if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) | ||
| 11227 | vmx_set_interrupt_shadow(vcpu, 0); | ||
| 11228 | |||
| 11229 | if (static_cpu_has(X86_FEATURE_PKU) && | ||
| 11230 | kvm_read_cr4_bits(vcpu, X86_CR4_PKE) && | ||
| 11231 | vcpu->arch.pkru != vmx->host_pkru) | ||
| 11232 | __write_pkru(vcpu->arch.pkru); | ||
| 11233 | |||
| 11234 | atomic_switch_perf_msrs(vmx); | ||
| 11235 | |||
| 11236 | vmx_update_hv_timer(vcpu); | ||
| 11237 | |||
| 11238 | /* | ||
| 11239 | * If this vCPU has touched SPEC_CTRL, restore the guest's value if | ||
| 11240 | * it's non-zero. Since vmentry is serialising on affected CPUs, there | ||
| 11241 | * is no need to worry about the conditional branch over the wrmsr | ||
| 11242 | * being speculatively taken. | ||
| 11243 | */ | ||
| 11244 | x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); | ||
| 11245 | |||
| 11246 | vmx->__launched = vmx->loaded_vmcs->launched; | ||
| 11247 | |||
| 11248 | evmcs_rsp = static_branch_unlikely(&enable_evmcs) ? | ||
| 11249 | (unsigned long)¤t_evmcs->host_rsp : 0; | ||
| 11250 | |||
| 11251 | if (static_branch_unlikely(&vmx_l1d_should_flush)) | ||
| 11252 | vmx_l1d_flush(vcpu); | ||
| 11253 | |||
| 11254 | asm( | ||
| 11255 | /* Store host registers */ | ||
| 11256 | "push %%" _ASM_DX "; push %%" _ASM_BP ";" | ||
| 11257 | "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */ | ||
| 11258 | "push %%" _ASM_CX " \n\t" | ||
| 11259 | "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t" | ||
| 11260 | "je 1f \n\t" | ||
| 11261 | "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t" | ||
| 11262 | /* Avoid VMWRITE when Enlightened VMCS is in use */ | ||
| 11263 | "test %%" _ASM_SI ", %%" _ASM_SI " \n\t" | ||
| 11264 | "jz 2f \n\t" | ||
| 11265 | "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t" | ||
| 11266 | "jmp 1f \n\t" | ||
| 11267 | "2: \n\t" | ||
| 11268 | __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" | ||
| 11269 | "1: \n\t" | ||
| 11270 | /* Reload cr2 if changed */ | ||
| 11271 | "mov %c[cr2](%0), %%" _ASM_AX " \n\t" | ||
| 11272 | "mov %%cr2, %%" _ASM_DX " \n\t" | ||
| 11273 | "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t" | ||
| 11274 | "je 3f \n\t" | ||
| 11275 | "mov %%" _ASM_AX", %%cr2 \n\t" | ||
| 11276 | "3: \n\t" | ||
| 11277 | /* Check if vmlaunch or vmresume is needed */ | ||
| 11278 | "cmpl $0, %c[launched](%0) \n\t" | ||
| 11279 | /* Load guest registers. Don't clobber flags. */ | ||
| 11280 | "mov %c[rax](%0), %%" _ASM_AX " \n\t" | ||
| 11281 | "mov %c[rbx](%0), %%" _ASM_BX " \n\t" | ||
| 11282 | "mov %c[rdx](%0), %%" _ASM_DX " \n\t" | ||
| 11283 | "mov %c[rsi](%0), %%" _ASM_SI " \n\t" | ||
| 11284 | "mov %c[rdi](%0), %%" _ASM_DI " \n\t" | ||
| 11285 | "mov %c[rbp](%0), %%" _ASM_BP " \n\t" | ||
| 11286 | #ifdef CONFIG_X86_64 | ||
| 11287 | "mov %c[r8](%0), %%r8 \n\t" | ||
| 11288 | "mov %c[r9](%0), %%r9 \n\t" | ||
| 11289 | "mov %c[r10](%0), %%r10 \n\t" | ||
| 11290 | "mov %c[r11](%0), %%r11 \n\t" | ||
| 11291 | "mov %c[r12](%0), %%r12 \n\t" | ||
| 11292 | "mov %c[r13](%0), %%r13 \n\t" | ||
| 11293 | "mov %c[r14](%0), %%r14 \n\t" | ||
| 11294 | "mov %c[r15](%0), %%r15 \n\t" | ||
| 11295 | #endif | ||
| 11296 | "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */ | ||
| 11297 | |||
| 11298 | /* Enter guest mode */ | ||
| 11299 | "jne 1f \n\t" | ||
| 11300 | __ex("vmlaunch") "\n\t" | ||
| 11301 | "jmp 2f \n\t" | ||
| 11302 | "1: " __ex("vmresume") "\n\t" | ||
| 11303 | "2: " | ||
| 11304 | /* Save guest registers, load host registers, keep flags */ | ||
| 11305 | "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t" | ||
| 11306 | "pop %0 \n\t" | ||
| 11307 | "setbe %c[fail](%0)\n\t" | ||
| 11308 | "mov %%" _ASM_AX ", %c[rax](%0) \n\t" | ||
| 11309 | "mov %%" _ASM_BX ", %c[rbx](%0) \n\t" | ||
| 11310 | __ASM_SIZE(pop) " %c[rcx](%0) \n\t" | ||
| 11311 | "mov %%" _ASM_DX ", %c[rdx](%0) \n\t" | ||
| 11312 | "mov %%" _ASM_SI ", %c[rsi](%0) \n\t" | ||
| 11313 | "mov %%" _ASM_DI ", %c[rdi](%0) \n\t" | ||
| 11314 | "mov %%" _ASM_BP ", %c[rbp](%0) \n\t" | ||
| 11315 | #ifdef CONFIG_X86_64 | ||
| 11316 | "mov %%r8, %c[r8](%0) \n\t" | ||
| 11317 | "mov %%r9, %c[r9](%0) \n\t" | ||
| 11318 | "mov %%r10, %c[r10](%0) \n\t" | ||
| 11319 | "mov %%r11, %c[r11](%0) \n\t" | ||
| 11320 | "mov %%r12, %c[r12](%0) \n\t" | ||
| 11321 | "mov %%r13, %c[r13](%0) \n\t" | ||
| 11322 | "mov %%r14, %c[r14](%0) \n\t" | ||
| 11323 | "mov %%r15, %c[r15](%0) \n\t" | ||
| 11324 | /* | ||
| 11325 | * Clear host registers marked as clobbered to prevent | ||
| 11326 | * speculative use. | ||
| 11327 | */ | ||
| 11328 | "xor %%r8d, %%r8d \n\t" | ||
| 11329 | "xor %%r9d, %%r9d \n\t" | ||
| 11330 | "xor %%r10d, %%r10d \n\t" | ||
| 11331 | "xor %%r11d, %%r11d \n\t" | ||
| 11332 | "xor %%r12d, %%r12d \n\t" | ||
| 11333 | "xor %%r13d, %%r13d \n\t" | ||
| 11334 | "xor %%r14d, %%r14d \n\t" | ||
| 11335 | "xor %%r15d, %%r15d \n\t" | ||
| 11336 | #endif | ||
| 11337 | "mov %%cr2, %%" _ASM_AX " \n\t" | ||
| 11338 | "mov %%" _ASM_AX ", %c[cr2](%0) \n\t" | ||
| 11339 | |||
| 11340 | "xor %%eax, %%eax \n\t" | ||
| 11341 | "xor %%ebx, %%ebx \n\t" | ||
| 11342 | "xor %%esi, %%esi \n\t" | ||
| 11343 | "xor %%edi, %%edi \n\t" | ||
| 11344 | "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" | ||
| 11345 | ".pushsection .rodata \n\t" | ||
| 11346 | ".global vmx_return \n\t" | ||
| 11347 | "vmx_return: " _ASM_PTR " 2b \n\t" | ||
| 11348 | ".popsection" | ||
| 11349 | : : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp), | ||
| 11350 | [launched]"i"(offsetof(struct vcpu_vmx, __launched)), | ||
| 11351 | [fail]"i"(offsetof(struct vcpu_vmx, fail)), | ||
| 11352 | [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), | ||
| 11353 | [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), | ||
| 11354 | [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), | ||
| 11355 | [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), | ||
| 11356 | [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), | ||
| 11357 | [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), | ||
| 11358 | [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), | ||
| 11359 | [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), | ||
| 11360 | #ifdef CONFIG_X86_64 | ||
| 11361 | [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), | ||
| 11362 | [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), | ||
| 11363 | [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), | ||
| 11364 | [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), | ||
| 11365 | [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), | ||
| 11366 | [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), | ||
| 11367 | [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), | ||
| 11368 | [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), | ||
| 11369 | #endif | ||
| 11370 | [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), | ||
| 11371 | [wordsize]"i"(sizeof(ulong)) | ||
| 11372 | : "cc", "memory" | ||
| 11373 | #ifdef CONFIG_X86_64 | ||
| 11374 | , "rax", "rbx", "rdi" | ||
| 11375 | , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" | ||
| 11376 | #else | ||
| 11377 | , "eax", "ebx", "edi" | ||
| 11378 | #endif | ||
| 11379 | ); | ||
| 11380 | |||
| 11381 | /* | ||
| 11382 | * We do not use IBRS in the kernel. If this vCPU has used the | ||
| 11383 | * SPEC_CTRL MSR it may have left it on; save the value and | ||
| 11384 | * turn it off. This is much more efficient than blindly adding | ||
| 11385 | * it to the atomic save/restore list. Especially as the former | ||
| 11386 | * (saving guest MSRs on vmexit) doesn't even exist in KVM. | ||
| 11387 | * | ||
| 11388 | * For non-nested case: | ||
| 11389 | * If the L01 MSR bitmap does not intercept the MSR, then we need to | ||
| 11390 | * save it. | ||
| 11391 | * | ||
| 11392 | * For nested case: | ||
| 11393 | * If the L02 MSR bitmap does not intercept the MSR, then we need to | ||
| 11394 | * save it. | ||
| 11395 | */ | ||
| 11396 | if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) | ||
| 11397 | vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); | ||
| 11398 | |||
| 11399 | x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); | ||
| 11400 | |||
| 11401 | /* Eliminate branch target predictions from guest mode */ | ||
| 11402 | vmexit_fill_RSB(); | ||
| 11403 | |||
| 11404 | /* All fields are clean at this point */ | ||
| 11405 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 11406 | current_evmcs->hv_clean_fields |= | ||
| 11407 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; | ||
| 11408 | |||
| 11409 | /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ | ||
| 11410 | if (vmx->host_debugctlmsr) | ||
| 11411 | update_debugctlmsr(vmx->host_debugctlmsr); | ||
| 11412 | |||
| 11413 | #ifndef CONFIG_X86_64 | ||
| 11414 | /* | ||
| 11415 | * The sysexit path does not restore ds/es, so we must set them to | ||
| 11416 | * a reasonable value ourselves. | ||
| 11417 | * | ||
| 11418 | * We can't defer this to vmx_prepare_switch_to_host() since that | ||
| 11419 | * function may be executed in interrupt context, which saves and | ||
| 11420 | * restores segments around it, nullifying its effect. | ||
| 11421 | */ | ||
| 11422 | loadsegment(ds, __USER_DS); | ||
| 11423 | loadsegment(es, __USER_DS); | ||
| 11424 | #endif | ||
| 11425 | |||
| 11426 | vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) | ||
| 11427 | | (1 << VCPU_EXREG_RFLAGS) | ||
| 11428 | | (1 << VCPU_EXREG_PDPTR) | ||
| 11429 | | (1 << VCPU_EXREG_SEGMENTS) | ||
| 11430 | | (1 << VCPU_EXREG_CR3)); | ||
| 11431 | vcpu->arch.regs_dirty = 0; | ||
| 11432 | |||
| 11433 | /* | ||
| 11434 | * Eager FPU is enabled if PKEYs are supported, and CR4 has been | ||
| 11435 | * switched back to the host value, so it is safe to read the guest | ||
| 11436 | * PKRU from the current XSAVE area. | ||
| 11437 | */ | ||
| 11438 | if (static_cpu_has(X86_FEATURE_PKU) && | ||
| 11439 | kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) { | ||
| 11440 | vcpu->arch.pkru = __read_pkru(); | ||
| 11441 | if (vcpu->arch.pkru != vmx->host_pkru) | ||
| 11442 | __write_pkru(vmx->host_pkru); | ||
| 11443 | } | ||
| 11444 | |||
| 11445 | vmx->nested.nested_run_pending = 0; | ||
| 11446 | vmx->idt_vectoring_info = 0; | ||
| 11447 | |||
| 11448 | vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON); | ||
| 11449 | if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) | ||
| 11450 | return; | ||
| 11451 | |||
| 11452 | vmx->loaded_vmcs->launched = 1; | ||
| 11453 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | ||
| 11454 | |||
| 11455 | vmx_complete_atomic_exit(vmx); | ||
| 11456 | vmx_recover_nmi_blocking(vmx); | ||
| 11457 | vmx_complete_interrupts(vmx); | ||
| 11458 | } | ||
| 11459 | STACK_FRAME_NON_STANDARD(vmx_vcpu_run); | ||
| 11460 | |||
| 11461 | static struct kvm *vmx_vm_alloc(void) | ||
| 11462 | { | ||
| 11463 | struct kvm_vmx *kvm_vmx = vzalloc(sizeof(struct kvm_vmx)); | ||
| 11464 | return &kvm_vmx->kvm; | ||
| 11465 | } | ||
| 11466 | |||
| 11467 | static void vmx_vm_free(struct kvm *kvm) | ||
| 11468 | { | ||
| 11469 | vfree(to_kvm_vmx(kvm)); | ||
| 11470 | } | ||
| 11471 | |||
| 11472 | static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) | ||
| 11473 | { | ||
| 11474 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 11475 | int cpu; | ||
| 11476 | |||
| 11477 | if (vmx->loaded_vmcs == vmcs) | ||
| 11478 | return; | ||
| 11479 | |||
| 11480 | cpu = get_cpu(); | ||
| 11481 | vmx_vcpu_put(vcpu); | ||
| 11482 | vmx->loaded_vmcs = vmcs; | ||
| 11483 | vmx_vcpu_load(vcpu, cpu); | ||
| 11484 | put_cpu(); | ||
| 11485 | |||
| 11486 | vm_entry_controls_reset_shadow(vmx); | ||
| 11487 | vm_exit_controls_reset_shadow(vmx); | ||
| 11488 | vmx_segment_cache_clear(vmx); | ||
| 11489 | } | ||
| 11490 | |||
| 11491 | /* | ||
| 11492 | * Ensure that the current vmcs of the logical processor is the | ||
| 11493 | * vmcs01 of the vcpu before calling free_nested(). | ||
| 11494 | */ | ||
| 11495 | static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu) | ||
| 11496 | { | ||
| 11497 | vcpu_load(vcpu); | ||
| 11498 | vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01); | ||
| 11499 | free_nested(vcpu); | ||
| 11500 | vcpu_put(vcpu); | ||
| 11501 | } | ||
| 11502 | |||
| 11503 | static void vmx_free_vcpu(struct kvm_vcpu *vcpu) | ||
| 11504 | { | ||
| 11505 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 11506 | |||
| 11507 | if (enable_pml) | ||
| 11508 | vmx_destroy_pml_buffer(vmx); | ||
| 11509 | free_vpid(vmx->vpid); | ||
| 11510 | leave_guest_mode(vcpu); | ||
| 11511 | vmx_free_vcpu_nested(vcpu); | ||
| 11512 | free_loaded_vmcs(vmx->loaded_vmcs); | ||
| 11513 | kfree(vmx->guest_msrs); | ||
| 11514 | kvm_vcpu_uninit(vcpu); | ||
| 11515 | kmem_cache_free(kvm_vcpu_cache, vmx); | ||
| 11516 | } | ||
| 11517 | |||
| 11518 | static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | ||
| 11519 | { | ||
| 11520 | int err; | ||
| 11521 | struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); | ||
| 11522 | unsigned long *msr_bitmap; | ||
| 11523 | int cpu; | ||
| 11524 | |||
| 11525 | if (!vmx) | ||
| 11526 | return ERR_PTR(-ENOMEM); | ||
| 11527 | |||
| 11528 | vmx->vpid = allocate_vpid(); | ||
| 11529 | |||
| 11530 | err = kvm_vcpu_init(&vmx->vcpu, kvm, id); | ||
| 11531 | if (err) | ||
| 11532 | goto free_vcpu; | ||
| 11533 | |||
| 11534 | err = -ENOMEM; | ||
| 11535 | |||
| 11536 | /* | ||
| 11537 | * If PML is turned on, a failure to enable PML simply results in a | ||
| 11538 | * failure to create the vcpu. This keeps the PML logic simple (no | ||
| 11539 | * need to deal with cases such as PML being enabled on only some of | ||
| 11540 | * the guest's vcpus). | ||
| 11541 | */ | ||
| 11542 | if (enable_pml) { | ||
| 11543 | vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
| 11544 | if (!vmx->pml_pg) | ||
| 11545 | goto uninit_vcpu; | ||
| 11546 | } | ||
| 11547 | |||
| 11548 | vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
| 11549 | BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0]) | ||
| 11550 | > PAGE_SIZE); | ||
| 11551 | |||
| 11552 | if (!vmx->guest_msrs) | ||
| 11553 | goto free_pml; | ||
| 11554 | |||
| 11555 | err = alloc_loaded_vmcs(&vmx->vmcs01); | ||
| 11556 | if (err < 0) | ||
| 11557 | goto free_msrs; | ||
| 11558 | |||
| 11559 | msr_bitmap = vmx->vmcs01.msr_bitmap; | ||
| 11560 | vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW); | ||
| 11561 | vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW); | ||
| 11562 | vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); | ||
| 11563 | vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); | ||
| 11564 | vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); | ||
| 11565 | vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); | ||
| 11566 | vmx->msr_bitmap_mode = 0; | ||
| 11567 | |||
| 11568 | vmx->loaded_vmcs = &vmx->vmcs01; | ||
| 11569 | cpu = get_cpu(); | ||
| 11570 | vmx_vcpu_load(&vmx->vcpu, cpu); | ||
| 11571 | vmx->vcpu.cpu = cpu; | ||
| 11572 | vmx_vcpu_setup(vmx); | ||
| 11573 | vmx_vcpu_put(&vmx->vcpu); | ||
| 11574 | put_cpu(); | ||
| 11575 | if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { | ||
| 11576 | err = alloc_apic_access_page(kvm); | ||
| 11577 | if (err) | ||
| 11578 | goto free_vmcs; | ||
| 11579 | } | ||
| 11580 | |||
| 11581 | if (enable_ept && !enable_unrestricted_guest) { | ||
| 11582 | err = init_rmode_identity_map(kvm); | ||
| 11583 | if (err) | ||
| 11584 | goto free_vmcs; | ||
| 11585 | } | ||
| 11586 | |||
| 11587 | if (nested) | ||
| 11588 | nested_vmx_setup_ctls_msrs(&vmx->nested.msrs, | ||
| 11589 | kvm_vcpu_apicv_active(&vmx->vcpu)); | ||
| 11590 | |||
| 11591 | vmx->nested.posted_intr_nv = -1; | ||
| 11592 | vmx->nested.current_vmptr = -1ull; | ||
| 11593 | |||
| 11594 | vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED; | ||
| 11595 | |||
| 11596 | /* | ||
| 11597 | * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR | ||
| 11598 | * or POSTED_INTR_WAKEUP_VECTOR. | ||
| 11599 | */ | ||
| 11600 | vmx->pi_desc.nv = POSTED_INTR_VECTOR; | ||
| 11601 | vmx->pi_desc.sn = 1; | ||
| 11602 | |||
| 11603 | return &vmx->vcpu; | ||
| 11604 | |||
| 11605 | free_vmcs: | ||
| 11606 | free_loaded_vmcs(vmx->loaded_vmcs); | ||
| 11607 | free_msrs: | ||
| 11608 | kfree(vmx->guest_msrs); | ||
| 11609 | free_pml: | ||
| 11610 | vmx_destroy_pml_buffer(vmx); | ||
| 11611 | uninit_vcpu: | ||
| 11612 | kvm_vcpu_uninit(&vmx->vcpu); | ||
| 11613 | free_vcpu: | ||
| 11614 | free_vpid(vmx->vpid); | ||
| 11615 | kmem_cache_free(kvm_vcpu_cache, vmx); | ||
| 11616 | return ERR_PTR(err); | ||
| 11617 | } | ||
| 11618 | |||
| 11619 | #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" | ||
| 11620 | #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" | ||
| 11621 | |||
| 11622 | static int vmx_vm_init(struct kvm *kvm) | ||
| 11623 | { | ||
| 11624 | spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock); | ||
| 11625 | |||
| 11626 | if (!ple_gap) | ||
| 11627 | kvm->arch.pause_in_guest = true; | ||
| 11628 | |||
| 11629 | if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { | ||
| 11630 | switch (l1tf_mitigation) { | ||
| 11631 | case L1TF_MITIGATION_OFF: | ||
| 11632 | case L1TF_MITIGATION_FLUSH_NOWARN: | ||
| 11633 | /* 'I explicitly don't care' is set */ | ||
| 11634 | break; | ||
| 11635 | case L1TF_MITIGATION_FLUSH: | ||
| 11636 | case L1TF_MITIGATION_FLUSH_NOSMT: | ||
| 11637 | case L1TF_MITIGATION_FULL: | ||
| 11638 | /* | ||
| 11639 | * Warn upon starting the first VM in a potentially | ||
| 11640 | * insecure environment. | ||
| 11641 | */ | ||
| 11642 | if (cpu_smt_control == CPU_SMT_ENABLED) | ||
| 11643 | pr_warn_once(L1TF_MSG_SMT); | ||
| 11644 | if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) | ||
| 11645 | pr_warn_once(L1TF_MSG_L1D); | ||
| 11646 | break; | ||
| 11647 | case L1TF_MITIGATION_FULL_FORCE: | ||
| 11648 | /* Flush is enforced */ | ||
| 11649 | break; | ||
| 11650 | } | ||
| 11651 | } | ||
| 11652 | return 0; | ||
| 11653 | } | ||
| 11654 | |||
| 11655 | static void __init vmx_check_processor_compat(void *rtn) | ||
| 11656 | { | ||
| 11657 | struct vmcs_config vmcs_conf; | ||
| 11658 | |||
| 11659 | *(int *)rtn = 0; | ||
| 11660 | if (setup_vmcs_config(&vmcs_conf) < 0) | ||
| 11661 | *(int *)rtn = -EIO; | ||
| 11662 | nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, enable_apicv); | ||
| 11663 | if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { | ||
| 11664 | printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", | ||
| 11665 | smp_processor_id()); | ||
| 11666 | *(int *)rtn = -EIO; | ||
| 11667 | } | ||
| 11668 | } | ||
| 11669 | |||
| 11670 | static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) | ||
| 11671 | { | ||
| 11672 | u8 cache; | ||
| 11673 | u64 ipat = 0; | ||
| 11674 | |||
| 11675 | /* For the VT-d and EPT combination: | ||
| 11676 | * 1. MMIO: always map as UC. | ||
| 11677 | * 2. EPT with VT-d: | ||
| 11678 | * a. VT-d without the snooping control feature: the result can't be | ||
| 11679 | * guaranteed, so trust the guest's memory type. | ||
| 11680 | * b. VT-d with the snooping control feature: snooping control | ||
| 11681 | * guarantees cache correctness. Just set it to WB to stay | ||
| 11682 | * consistent with the host, i.e. the same as item 3. | ||
| 11683 | * 3. EPT without VT-d: always map as WB and set IPAT=1 to stay | ||
| 11684 | * consistent with the host MTRRs. | ||
| 11685 | */ | ||
| 11686 | if (is_mmio) { | ||
| 11687 | cache = MTRR_TYPE_UNCACHABLE; | ||
| 11688 | goto exit; | ||
| 11689 | } | ||
| 11690 | |||
| 11691 | if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) { | ||
| 11692 | ipat = VMX_EPT_IPAT_BIT; | ||
| 11693 | cache = MTRR_TYPE_WRBACK; | ||
| 11694 | goto exit; | ||
| 11695 | } | ||
| 11696 | |||
| 11697 | if (kvm_read_cr0(vcpu) & X86_CR0_CD) { | ||
| 11698 | ipat = VMX_EPT_IPAT_BIT; | ||
| 11699 | if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) | ||
| 11700 | cache = MTRR_TYPE_WRBACK; | ||
| 11701 | else | ||
| 11702 | cache = MTRR_TYPE_UNCACHABLE; | ||
| 11703 | goto exit; | ||
| 11704 | } | ||
| 11705 | |||
| 11706 | cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn); | ||
| 11707 | |||
| 11708 | exit: | ||
| 11709 | return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; | ||
| 11710 | } | ||
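
The value returned above is simply two EPT PTE attribute fields packed together: the memory type in bits 5:3 and the "ignore PAT" flag in bit 6. A small sketch of the composition, with constants defined locally for the example instead of the kernel's VMX_EPT_* and MTRR_TYPE_* macros:

    #include <stdint.h>
    #include <stdio.h>

    #define EPT_MT_SHIFT    3
    #define EPT_IPAT_BIT    (1u << 6)
    #define MT_UNCACHABLE   0
    #define MT_WRBACK       6

    static uint64_t make_ept_memtype(uint8_t type, int ignore_guest_pat)
    {
        return ((uint64_t)type << EPT_MT_SHIFT) |
               (ignore_guest_pat ? EPT_IPAT_BIT : 0);
    }

    int main(void)
    {
        /* MMIO: force UC and honour the guest PAT (ipat stays clear). */
        printf("MMIO       -> 0x%llx\n",
               (unsigned long long)make_ept_memtype(MT_UNCACHABLE, 0));
        /* No non-coherent DMA: force WB and ignore the guest PAT. */
        printf("normal RAM -> 0x%llx\n",
               (unsigned long long)make_ept_memtype(MT_WRBACK, 1));
        return 0;
    }
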
| 11711 | |||
| 11712 | static int vmx_get_lpage_level(void) | ||
| 11713 | { | ||
| 11714 | if (enable_ept && !cpu_has_vmx_ept_1g_page()) | ||
| 11715 | return PT_DIRECTORY_LEVEL; | ||
| 11716 | else | ||
| 11717 | /* Both shadow paging and EPT support 1GB pages */ | ||
| 11718 | return PT_PDPE_LEVEL; | ||
| 11719 | } | ||
| 11720 | |||
| 11721 | static void vmcs_set_secondary_exec_control(u32 new_ctl) | ||
| 11722 | { | ||
| 11723 | /* | ||
| 11724 | * These bits in the secondary execution controls field | ||
| 11725 | * are dynamic; the others are mostly based on the hypervisor | ||
| 11726 | * architecture and the guest's CPUID. Do not touch the | ||
| 11727 | * dynamic bits. | ||
| 11728 | */ | ||
| 11729 | u32 mask = | ||
| 11730 | SECONDARY_EXEC_SHADOW_VMCS | | ||
| 11731 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | | ||
| 11732 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | | ||
| 11733 | SECONDARY_EXEC_DESC; | ||
| 11734 | |||
| 11735 | u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); | ||
| 11736 | |||
| 11737 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, | ||
| 11738 | (new_ctl & ~mask) | (cur_ctl & mask)); | ||
| 11739 | } | ||
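
The write above is a plain read-modify-write merge: bits covered by the mask keep their current VMCS value, everything else comes from the freshly computed control word. A stand-alone sketch of that merge, with the mask and values invented for the example:

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t merge_controls(uint32_t cur, uint32_t fresh, uint32_t mask)
    {
        /* Dynamic bits (mask) are preserved; the rest is overwritten. */
        return (fresh & ~mask) | (cur & mask);
    }

    int main(void)
    {
        uint32_t mask  = 0x000000c2;  /* pretend these are the dynamic bits */
        uint32_t cur   = 0x00000182;  /* current VMCS value */
        uint32_t fresh = 0x00000401;  /* newly computed controls */

        printf("merged = 0x%08x\n", (unsigned)merge_controls(cur, fresh, mask));
        return 0;
    }
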
| 11740 | |||
| 11741 | /* | ||
| 11742 | * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits | ||
| 11743 | * (indicating "allowed-1") if they are supported in the guest's CPUID. | ||
| 11744 | */ | ||
| 11745 | static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) | ||
| 11746 | { | ||
| 11747 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 11748 | struct kvm_cpuid_entry2 *entry; | ||
| 11749 | |||
| 11750 | vmx->nested.msrs.cr0_fixed1 = 0xffffffff; | ||
| 11751 | vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE; | ||
| 11752 | |||
| 11753 | #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ | ||
| 11754 | if (entry && (entry->_reg & (_cpuid_mask))) \ | ||
| 11755 | vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ | ||
| 11756 | } while (0) | ||
| 11757 | |||
| 11758 | entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); | ||
| 11759 | cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME)); | ||
| 11760 | cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME)); | ||
| 11761 | cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC)); | ||
| 11762 | cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE)); | ||
| 11763 | cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE)); | ||
| 11764 | cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE)); | ||
| 11765 | cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE)); | ||
| 11766 | cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE)); | ||
| 11767 | cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR)); | ||
| 11768 | cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM)); | ||
| 11769 | cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX)); | ||
| 11770 | cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX)); | ||
| 11771 | cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID)); | ||
| 11772 | cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE)); | ||
| 11773 | |||
| 11774 | entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); | ||
| 11775 | cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE)); | ||
| 11776 | cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP)); | ||
| 11777 | cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP)); | ||
| 11778 | cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU)); | ||
| 11779 | cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP)); | ||
| 11780 | |||
| 11781 | #undef cr4_fixed1_update | ||
| 11782 | } | ||
| 11783 | |||
| 11784 | static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) | ||
| 11785 | { | ||
| 11786 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 11787 | |||
| 11788 | if (kvm_mpx_supported()) { | ||
| 11789 | bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX); | ||
| 11790 | |||
| 11791 | if (mpx_enabled) { | ||
| 11792 | vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; | ||
| 11793 | vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; | ||
| 11794 | } else { | ||
| 11795 | vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS; | ||
| 11796 | vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS; | ||
| 11797 | } | ||
| 11798 | } | ||
| 11799 | } | ||
| 11800 | |||
| 11801 | static void vmx_cpuid_update(struct kvm_vcpu *vcpu) | ||
| 11802 | { | ||
| 11803 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 11804 | |||
| 11805 | if (cpu_has_secondary_exec_ctrls()) { | ||
| 11806 | vmx_compute_secondary_exec_control(vmx); | ||
| 11807 | vmcs_set_secondary_exec_control(vmx->secondary_exec_control); | ||
| 11808 | } | ||
| 11809 | |||
| 11810 | if (nested_vmx_allowed(vcpu)) | ||
| 11811 | to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= | ||
| 11812 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; | ||
| 11813 | else | ||
| 11814 | to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= | ||
| 11815 | ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; | ||
| 11816 | |||
| 11817 | if (nested_vmx_allowed(vcpu)) { | ||
| 11818 | nested_vmx_cr_fixed1_bits_update(vcpu); | ||
| 11819 | nested_vmx_entry_exit_ctls_update(vcpu); | ||
| 11820 | } | ||
| 11821 | } | ||
| 11822 | |||
| 11823 | static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) | ||
| 11824 | { | ||
| 11825 | if (func == 1 && nested) | ||
| 11826 | entry->ecx |= bit(X86_FEATURE_VMX); | ||
| 11827 | } | ||
| 11828 | |||
| 11829 | static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, | ||
| 11830 | struct x86_exception *fault) | ||
| 11831 | { | ||
| 11832 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 11833 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 11834 | u32 exit_reason; | ||
| 11835 | unsigned long exit_qualification = vcpu->arch.exit_qualification; | ||
| 11836 | |||
| 11837 | if (vmx->nested.pml_full) { | ||
| 11838 | exit_reason = EXIT_REASON_PML_FULL; | ||
| 11839 | vmx->nested.pml_full = false; | ||
| 11840 | exit_qualification &= INTR_INFO_UNBLOCK_NMI; | ||
| 11841 | } else if (fault->error_code & PFERR_RSVD_MASK) | ||
| 11842 | exit_reason = EXIT_REASON_EPT_MISCONFIG; | ||
| 11843 | else | ||
| 11844 | exit_reason = EXIT_REASON_EPT_VIOLATION; | ||
| 11845 | |||
| 11846 | nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification); | ||
| 11847 | vmcs12->guest_physical_address = fault->address; | ||
| 11848 | } | ||
| 11849 | |||
| 11850 | static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu) | ||
| 11851 | { | ||
| 11852 | return nested_ept_get_cr3(vcpu) & VMX_EPTP_AD_ENABLE_BIT; | ||
| 11853 | } | ||
| 11854 | |||
| 11855 | /* Callbacks for nested_ept_init_mmu_context: */ | ||
| 11856 | |||
| 11857 | static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) | ||
| 11858 | { | ||
| 11859 | /* return the page table to be shadowed - in our case, EPT12 */ | ||
| 11860 | return get_vmcs12(vcpu)->ept_pointer; | ||
| 11861 | } | ||
| 11862 | |||
| 11863 | static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) | ||
| 11864 | { | ||
| 11865 | WARN_ON(mmu_is_nested(vcpu)); | ||
| 11866 | |||
| 11867 | vcpu->arch.mmu = &vcpu->arch.guest_mmu; | ||
| 11868 | kvm_init_shadow_ept_mmu(vcpu, | ||
| 11869 | to_vmx(vcpu)->nested.msrs.ept_caps & | ||
| 11870 | VMX_EPT_EXECUTE_ONLY_BIT, | ||
| 11871 | nested_ept_ad_enabled(vcpu), | ||
| 11872 | nested_ept_get_cr3(vcpu)); | ||
| 11873 | vcpu->arch.mmu->set_cr3 = vmx_set_cr3; | ||
| 11874 | vcpu->arch.mmu->get_cr3 = nested_ept_get_cr3; | ||
| 11875 | vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; | ||
| 11876 | vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; | ||
| 11877 | |||
| 11878 | vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; | ||
| 11879 | } | ||
| 11880 | |||
| 11881 | static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) | ||
| 11882 | { | ||
| 11883 | vcpu->arch.mmu = &vcpu->arch.root_mmu; | ||
| 11884 | vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; | ||
| 11885 | } | ||
| 11886 | |||
| 11887 | static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, | ||
| 11888 | u16 error_code) | ||
| 11889 | { | ||
| 11890 | bool inequality, bit; | ||
| 11891 | |||
| 11892 | bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; | ||
| 11893 | inequality = | ||
| 11894 | (error_code & vmcs12->page_fault_error_code_mask) != | ||
| 11895 | vmcs12->page_fault_error_code_match; | ||
| 11896 | return inequality ^ bit; | ||
| 11897 | } | ||
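
The XOR above encodes the SDM rule for #PF and the exception bitmap: with EB.PF set, a page fault exits to L1 only when the masked error code equals PFEC_MATCH; with EB.PF clear, only when it does not. As a standalone userspace sketch of that predicate (illustration only; the L1 configuration in main() is hypothetical):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Userspace restatement of the predicate above: with EB.PF set, a page fault
 * exits to L1 only when the masked error code equals PFEC_MATCH; with EB.PF
 * clear, only when it does not. */
static bool pf_causes_vmexit(bool eb_pf, uint32_t error_code,
                             uint32_t pfec_mask, uint32_t pfec_match)
{
        bool inequality = (error_code & pfec_mask) != pfec_match;

        return inequality ^ eb_pf;
}

int main(void)
{
        /* Hypothetical L1 setup: trap only write faults (bit 1 of the PFEC). */
        printf("write fault exits: %d, read fault exits: %d\n",
               pf_causes_vmexit(true, 0x2, 0x2, 0x2),
               pf_causes_vmexit(true, 0x0, 0x2, 0x2));
        return 0;
}
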
| 11898 | |||
| 11899 | static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, | ||
| 11900 | struct x86_exception *fault) | ||
| 11901 | { | ||
| 11902 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 11903 | |||
| 11904 | WARN_ON(!is_guest_mode(vcpu)); | ||
| 11905 | |||
| 11906 | if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) && | ||
| 11907 | !to_vmx(vcpu)->nested.nested_run_pending) { | ||
| 11908 | vmcs12->vm_exit_intr_error_code = fault->error_code; | ||
| 11909 | nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, | ||
| 11910 | PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | | ||
| 11911 | INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK, | ||
| 11912 | fault->address); | ||
| 11913 | } else { | ||
| 11914 | kvm_inject_page_fault(vcpu, fault); | ||
| 11915 | } | ||
| 11916 | } | ||
| 11917 | |||
| 11918 | static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, | ||
| 11919 | struct vmcs12 *vmcs12); | ||
| 11920 | |||
| 11921 | static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) | ||
| 11922 | { | ||
| 11923 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 11924 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 11925 | struct page *page; | ||
| 11926 | u64 hpa; | ||
| 11927 | |||
| 11928 | if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { | ||
| 11929 | /* | ||
| 11930 | * Translate L1 physical address to host physical | ||
| 11931 | * address for vmcs02. Keep the page pinned, so this | ||
| 11932 | * physical address remains valid. We keep a reference | ||
| 11933 | * to it so we can release it later. | ||
| 11934 | */ | ||
| 11935 | if (vmx->nested.apic_access_page) { /* shouldn't happen */ | ||
| 11936 | kvm_release_page_dirty(vmx->nested.apic_access_page); | ||
| 11937 | vmx->nested.apic_access_page = NULL; | ||
| 11938 | } | ||
| 11939 | page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); | ||
| 11940 | /* | ||
| 11941 | * If translation failed, no matter: This feature asks | ||
| 11942 | * to exit when accessing the given address, and if it | ||
| 11943 | * can never be accessed, this feature won't do | ||
| 11944 | * anything anyway. | ||
| 11945 | */ | ||
| 11946 | if (!is_error_page(page)) { | ||
| 11947 | vmx->nested.apic_access_page = page; | ||
| 11948 | hpa = page_to_phys(vmx->nested.apic_access_page); | ||
| 11949 | vmcs_write64(APIC_ACCESS_ADDR, hpa); | ||
| 11950 | } else { | ||
| 11951 | vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, | ||
| 11952 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); | ||
| 11953 | } | ||
| 11954 | } | ||
| 11955 | |||
| 11956 | if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { | ||
| 11957 | if (vmx->nested.virtual_apic_page) { /* shouldn't happen */ | ||
| 11958 | kvm_release_page_dirty(vmx->nested.virtual_apic_page); | ||
| 11959 | vmx->nested.virtual_apic_page = NULL; | ||
| 11960 | } | ||
| 11961 | page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr); | ||
| 11962 | |||
| 11963 | /* | ||
| 11964 | * If translation failed, VM entry will fail because | ||
| 11965 | * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull. | ||
| 11966 | * Failing the vm entry is _not_ what the processor | ||
| 11967 | * does but it's basically the only possibility we | ||
| 11968 | * have. We could still enter the guest if CR8 load | ||
| 11969 | * exits are enabled, CR8 store exits are enabled, and | ||
| 11970 | * virtualize APIC access is disabled; in this case | ||
| 11971 | * the processor would never use the TPR shadow and we | ||
| 11972 | * could simply clear the bit from the execution | ||
| 11973 | * control. But such a configuration is useless, so | ||
| 11974 | * let's keep the code simple. | ||
| 11975 | */ | ||
| 11976 | if (!is_error_page(page)) { | ||
| 11977 | vmx->nested.virtual_apic_page = page; | ||
| 11978 | hpa = page_to_phys(vmx->nested.virtual_apic_page); | ||
| 11979 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa); | ||
| 11980 | } | ||
| 11981 | } | ||
| 11982 | |||
| 11983 | if (nested_cpu_has_posted_intr(vmcs12)) { | ||
| 11984 | if (vmx->nested.pi_desc_page) { /* shouldn't happen */ | ||
| 11985 | kunmap(vmx->nested.pi_desc_page); | ||
| 11986 | kvm_release_page_dirty(vmx->nested.pi_desc_page); | ||
| 11987 | vmx->nested.pi_desc_page = NULL; | ||
| 11988 | vmx->nested.pi_desc = NULL; | ||
| 11989 | vmcs_write64(POSTED_INTR_DESC_ADDR, -1ull); | ||
| 11990 | } | ||
| 11991 | page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr); | ||
| 11992 | if (is_error_page(page)) | ||
| 11993 | return; | ||
| 11994 | vmx->nested.pi_desc_page = page; | ||
| 11995 | vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page); | ||
| 11996 | vmx->nested.pi_desc = | ||
| 11997 | (struct pi_desc *)((void *)vmx->nested.pi_desc + | ||
| 11998 | (unsigned long)(vmcs12->posted_intr_desc_addr & | ||
| 11999 | (PAGE_SIZE - 1))); | ||
| 12000 | vmcs_write64(POSTED_INTR_DESC_ADDR, | ||
| 12001 | page_to_phys(vmx->nested.pi_desc_page) + | ||
| 12002 | (unsigned long)(vmcs12->posted_intr_desc_addr & | ||
| 12003 | (PAGE_SIZE - 1))); | ||
| 12004 | } | ||
| 12005 | if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) | ||
| 12006 | vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, | ||
| 12007 | CPU_BASED_USE_MSR_BITMAPS); | ||
| 12008 | else | ||
| 12009 | vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, | ||
| 12010 | CPU_BASED_USE_MSR_BITMAPS); | ||
| 12011 | } | ||
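
For the posted-interrupt descriptor handled above, only the page-offset bits of the guest-physical address carry over: the page-aligned part selects the guest page that gets pinned and mapped, and the low 12 bits are added both to the kmap()ed virtual address and to the host-physical address written to POSTED_INTR_DESC_ADDR. A small userspace sketch of that split, using a made-up address:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const uint64_t page_size = 4096;       /* x86 page size */
        /* Hypothetical guest-physical posted-interrupt descriptor address. */
        uint64_t desc_gpa = 0x12345a40ULL;

        /* The page-aligned part selects which guest page gets pinned and
         * mapped ... */
        uint64_t page_gpa = desc_gpa & ~(page_size - 1);
        /* ... and the low bits are the offset added both to the kmap()ed
         * virtual address and to the host-physical address written to
         * POSTED_INTR_DESC_ADDR. */
        uint64_t offset   = desc_gpa & (page_size - 1);

        printf("page gpa 0x%llx, offset 0x%llx\n",
               (unsigned long long)page_gpa, (unsigned long long)offset);
        return 0;
}
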
| 12012 | |||
| 12013 | static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) | ||
| 12014 | { | ||
| 12015 | u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; | ||
| 12016 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 12017 | |||
| 12018 | /* | ||
| 12019 | * A timer value of zero is architecturally guaranteed to cause | ||
| 12020 | * a VMExit prior to executing any instructions in the guest. | ||
| 12021 | */ | ||
| 12022 | if (preemption_timeout == 0) { | ||
| 12023 | vmx_preemption_timer_fn(&vmx->nested.preemption_timer); | ||
| 12024 | return; | ||
| 12025 | } | ||
| 12026 | |||
| 12027 | if (vcpu->arch.virtual_tsc_khz == 0) | ||
| 12028 | return; | ||
| 12029 | |||
| 12030 | preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; | ||
| 12031 | preemption_timeout *= 1000000; | ||
| 12032 | do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); | ||
| 12033 | hrtimer_start(&vmx->nested.preemption_timer, | ||
| 12034 | ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); | ||
| 12035 | } | ||
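
The shift/multiply/divide above converts the vmcs12 timer value, which counts units of 2^VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE TSC cycles, into nanoseconds for the hrtimer. A standalone sketch of the same arithmetic in plain userspace C (assuming an emulated rate of 5 and a hypothetical 2 GHz virtual TSC, so a timer value of 1000 works out to 16000 ns):

#include <stdint.h>
#include <stdio.h>

/* Plain userspace mirror of the conversion above: the vmcs12 value counts
 * units of 2^rate TSC cycles, and cycles * 1e6 / tsc_khz gives nanoseconds. */
static uint64_t preemption_timeout_ns(uint64_t vmcs12_value, unsigned int rate,
                                      uint64_t virtual_tsc_khz)
{
        uint64_t tsc_cycles = vmcs12_value << rate;

        return tsc_cycles * 1000000ULL / virtual_tsc_khz;
}

int main(void)
{
        /* Hypothetical numbers: timer value 1000, rate 5, 2 GHz virtual TSC:
         * 1000 << 5 = 32000 cycles, i.e. 16000 ns at 2 GHz. */
        printf("%llu ns\n",
               (unsigned long long)preemption_timeout_ns(1000, 5, 2000000));
        return 0;
}
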
| 12036 | |||
| 12037 | static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, | ||
| 12038 | struct vmcs12 *vmcs12) | ||
| 12039 | { | ||
| 12040 | if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) | ||
| 12041 | return 0; | ||
| 12042 | |||
| 12043 | if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) || | ||
| 12044 | !page_address_valid(vcpu, vmcs12->io_bitmap_b)) | ||
| 12045 | return -EINVAL; | ||
| 12046 | |||
| 12047 | return 0; | ||
| 12048 | } | ||
| 12049 | |||
| 12050 | static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, | ||
| 12051 | struct vmcs12 *vmcs12) | ||
| 12052 | { | ||
| 12053 | if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) | ||
| 12054 | return 0; | ||
| 12055 | |||
| 12056 | if (!page_address_valid(vcpu, vmcs12->msr_bitmap)) | ||
| 12057 | return -EINVAL; | ||
| 12058 | |||
| 12059 | return 0; | ||
| 12060 | } | ||
| 12061 | |||
| 12062 | static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, | ||
| 12063 | struct vmcs12 *vmcs12) | ||
| 12064 | { | ||
| 12065 | if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) | ||
| 12066 | return 0; | ||
| 12067 | |||
| 12068 | if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)) | ||
| 12069 | return -EINVAL; | ||
| 12070 | |||
| 12071 | return 0; | ||
| 12072 | } | ||
| 12073 | |||
| 12074 | /* | ||
| 12075 | * Merge L0's and L1's MSR bitmaps; return false to indicate that | ||
| 12076 | * the hardware MSR bitmap should not be used. | ||
| 12077 | */ | ||
| 12078 | static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, | ||
| 12079 | struct vmcs12 *vmcs12) | ||
| 12080 | { | ||
| 12081 | int msr; | ||
| 12082 | struct page *page; | ||
| 12083 | unsigned long *msr_bitmap_l1; | ||
| 12084 | unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; | ||
| 12085 | /* | ||
| 12086 | * pred_cmd & spec_ctrl are trying to verify two things: | ||
| 12087 | * | ||
| 12088 | * 1. L0 gave a permission to L1 to actually passthrough the MSR. This | ||
| 12089 | * ensures that we do not accidentally generate an L02 MSR bitmap | ||
| 12090 | * from the L12 MSR bitmap that is too permissive. | ||
| 12092 | * 2. That L1 or its L2s have actually used the MSR. This avoids | ||
| 12093 | * unnecessarily merging the bitmap if the MSR is unused. This | ||
| 12094 | * works properly because we only update the L01 MSR bitmap lazily. | ||
| 12095 | * So even if L0 should pass L1 these MSRs, the L01 bitmap is only | ||
| 12096 | * updated to reflect this when L1 (or its L2s) actually writes to | ||
| 12097 | * the MSR. | ||
| 12097 | */ | ||
| 12098 | bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD); | ||
| 12099 | bool spec_ctrl = !msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL); | ||
| 12100 | |||
| 12101 | /* Nothing to do if the MSR bitmap is not in use. */ | ||
| 12102 | if (!cpu_has_vmx_msr_bitmap() || | ||
| 12103 | !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) | ||
| 12104 | return false; | ||
| 12105 | |||
| 12106 | if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && | ||
| 12107 | !pred_cmd && !spec_ctrl) | ||
| 12108 | return false; | ||
| 12109 | |||
| 12110 | page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap); | ||
| 12111 | if (is_error_page(page)) | ||
| 12112 | return false; | ||
| 12113 | |||
| 12114 | msr_bitmap_l1 = (unsigned long *)kmap(page); | ||
| 12115 | if (nested_cpu_has_apic_reg_virt(vmcs12)) { | ||
| 12116 | /* | ||
| 12117 | * L0 need not intercept reads for MSRs between 0x800 and 0x8ff; it | ||
| 12118 | * just lets the processor take the value from the virtual-APIC page, | ||
| 12119 | * so take those 256 bits directly from the L1 bitmap. | ||
| 12120 | */ | ||
| 12121 | for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { | ||
| 12122 | unsigned word = msr / BITS_PER_LONG; | ||
| 12123 | msr_bitmap_l0[word] = msr_bitmap_l1[word]; | ||
| 12124 | msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0; | ||
| 12125 | } | ||
| 12126 | } else { | ||
| 12127 | for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { | ||
| 12128 | unsigned word = msr / BITS_PER_LONG; | ||
| 12129 | msr_bitmap_l0[word] = ~0; | ||
| 12130 | msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0; | ||
| 12131 | } | ||
| 12132 | } | ||
| 12133 | |||
| 12134 | nested_vmx_disable_intercept_for_msr( | ||
| 12135 | msr_bitmap_l1, msr_bitmap_l0, | ||
| 12136 | X2APIC_MSR(APIC_TASKPRI), | ||
| 12137 | MSR_TYPE_W); | ||
| 12138 | |||
| 12139 | if (nested_cpu_has_vid(vmcs12)) { | ||
| 12140 | nested_vmx_disable_intercept_for_msr( | ||
| 12141 | msr_bitmap_l1, msr_bitmap_l0, | ||
| 12142 | X2APIC_MSR(APIC_EOI), | ||
| 12143 | MSR_TYPE_W); | ||
| 12144 | nested_vmx_disable_intercept_for_msr( | ||
| 12145 | msr_bitmap_l1, msr_bitmap_l0, | ||
| 12146 | X2APIC_MSR(APIC_SELF_IPI), | ||
| 12147 | MSR_TYPE_W); | ||
| 12148 | } | ||
| 12149 | |||
| 12150 | if (spec_ctrl) | ||
| 12151 | nested_vmx_disable_intercept_for_msr( | ||
| 12152 | msr_bitmap_l1, msr_bitmap_l0, | ||
| 12153 | MSR_IA32_SPEC_CTRL, | ||
| 12154 | MSR_TYPE_R | MSR_TYPE_W); | ||
| 12155 | |||
| 12156 | if (pred_cmd) | ||
| 12157 | nested_vmx_disable_intercept_for_msr( | ||
| 12158 | msr_bitmap_l1, msr_bitmap_l0, | ||
| 12159 | MSR_IA32_PRED_CMD, | ||
| 12160 | MSR_TYPE_W); | ||
| 12161 | |||
| 12162 | kunmap(page); | ||
| 12163 | kvm_release_page_clean(page); | ||
| 12164 | |||
| 12165 | return true; | ||
| 12166 | } | ||
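
The word indexing in the merge loops above follows the VMX MSR-bitmap layout: the read bitmap for low MSRs (0x0-0x1fff) starts at byte 0 of the 4 KiB page and the corresponding write bitmap starts at byte 0x800, so the write-side long sits 0x800/sizeof(long) entries after the read-side one. A userspace sketch of that index math for one x2APIC register (APIC_TASKPRI, MSR 0x808):

#include <stdio.h>

int main(void)
{
        const unsigned int bits_per_long = 8 * sizeof(unsigned long);
        /* x2APIC TPR (APIC_TASKPRI as an MSR) lives at MSR 0x808. */
        unsigned int msr = 0x808;

        /* Long that holds this MSR's bit in the low-MSR read bitmap
         * (bytes 0x000-0x3ff of the 4 KiB VMX MSR bitmap) ... */
        unsigned int read_word  = msr / bits_per_long;
        /* ... and in the low-MSR write bitmap, which starts 0x800 bytes
         * into the page, i.e. 0x800 / sizeof(long) longs further on. */
        unsigned int write_word = read_word + 0x800 / sizeof(unsigned long);
        unsigned int bit        = msr % bits_per_long;

        printf("read: word %u bit %u; write: word %u bit %u\n",
               read_word, bit, write_word, bit);
        return 0;
}
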
| 12167 | |||
| 12168 | static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, | ||
| 12169 | struct vmcs12 *vmcs12) | ||
| 12170 | { | ||
| 12171 | struct vmcs12 *shadow; | ||
| 12172 | struct page *page; | ||
| 12173 | |||
| 12174 | if (!nested_cpu_has_shadow_vmcs(vmcs12) || | ||
| 12175 | vmcs12->vmcs_link_pointer == -1ull) | ||
| 12176 | return; | ||
| 12177 | |||
| 12178 | shadow = get_shadow_vmcs12(vcpu); | ||
| 12179 | page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); | ||
| 12180 | |||
| 12181 | memcpy(shadow, kmap(page), VMCS12_SIZE); | ||
| 12182 | |||
| 12183 | kunmap(page); | ||
| 12184 | kvm_release_page_clean(page); | ||
| 12185 | } | ||
| 12186 | |||
| 12187 | static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, | ||
| 12188 | struct vmcs12 *vmcs12) | ||
| 12189 | { | ||
| 12190 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 12191 | |||
| 12192 | if (!nested_cpu_has_shadow_vmcs(vmcs12) || | ||
| 12193 | vmcs12->vmcs_link_pointer == -1ull) | ||
| 12194 | return; | ||
| 12195 | |||
| 12196 | kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer, | ||
| 12197 | get_shadow_vmcs12(vcpu), VMCS12_SIZE); | ||
| 12198 | } | ||
| 12199 | |||
| 12200 | static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, | ||
| 12201 | struct vmcs12 *vmcs12) | ||
| 12202 | { | ||
| 12203 | if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && | ||
| 12204 | !page_address_valid(vcpu, vmcs12->apic_access_addr)) | ||
| 12205 | return -EINVAL; | ||
| 12206 | else | ||
| 12207 | return 0; | ||
| 12208 | } | ||
| 12209 | |||
| 12210 | static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, | ||
| 12211 | struct vmcs12 *vmcs12) | ||
| 12212 | { | ||
| 12213 | if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && | ||
| 12214 | !nested_cpu_has_apic_reg_virt(vmcs12) && | ||
| 12215 | !nested_cpu_has_vid(vmcs12) && | ||
| 12216 | !nested_cpu_has_posted_intr(vmcs12)) | ||
| 12217 | return 0; | ||
| 12218 | |||
| 12219 | /* | ||
| 12220 | * If virtualize x2apic mode is enabled, | ||
| 12221 | * virtualize apic access must be disabled. | ||
| 12222 | */ | ||
| 12223 | if (nested_cpu_has_virt_x2apic_mode(vmcs12) && | ||
| 12224 | nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) | ||
| 12225 | return -EINVAL; | ||
| 12226 | |||
| 12227 | /* | ||
| 12228 | * If virtual interrupt delivery is enabled, | ||
| 12229 | * we must exit on external interrupts. | ||
| 12230 | */ | ||
| 12231 | if (nested_cpu_has_vid(vmcs12) && | ||
| 12232 | !nested_exit_on_intr(vcpu)) | ||
| 12233 | return -EINVAL; | ||
| 12234 | |||
| 12235 | /* | ||
| 12236 | * Bits 15:8 must be zero in posted_intr_nv; the | ||
| 12237 | * descriptor address has already been checked | ||
| 12238 | * in nested_get_vmcs12_pages. | ||
| 12239 | * | ||
| 12240 | * Bits 5:0 of posted_intr_desc_addr must be zero. | ||
| 12241 | */ | ||
| 12242 | if (nested_cpu_has_posted_intr(vmcs12) && | ||
| 12243 | (!nested_cpu_has_vid(vmcs12) || | ||
| 12244 | !nested_exit_intr_ack_set(vcpu) || | ||
| 12245 | (vmcs12->posted_intr_nv & 0xff00) || | ||
| 12246 | (vmcs12->posted_intr_desc_addr & 0x3f) || | ||
| 12247 | (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))) | ||
| 12248 | return -EINVAL; | ||
| 12249 | |||
| 12250 | /* TPR shadow is required by all APICv features. */ | ||
| 12251 | if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) | ||
| 12252 | return -EINVAL; | ||
| 12253 | |||
| 12254 | return 0; | ||
| 12255 | } | ||
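
The posted-interrupt checks near the end of this function are pure bit tests on two vmcs12 fields: the notification vector must fit in bits 7:0, and the descriptor address must be 64-byte aligned and lie below the guest's MAXPHYADDR. A userspace restatement of just those tests (the MAXPHYADDR of 46 and the values in main() are made up):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Userspace restatement of the posted-interrupt field checks above. */
static bool posted_intr_fields_ok(uint16_t nv, uint64_t desc_addr,
                                  int maxphyaddr)
{
        return !(nv & 0xff00) &&           /* notification vector fits in 7:0 */
               !(desc_addr & 0x3f) &&      /* descriptor is 64-byte aligned   */
               !(desc_addr >> maxphyaddr); /* and lies below MAXPHYADDR       */
}

int main(void)
{
        printf("%d %d\n",
               posted_intr_fields_ok(0xf0, 0x12340000ULL, 46),  /* ok         */
               posted_intr_fields_ok(0xf0, 0x12340020ULL, 46)); /* misaligned */
        return 0;
}
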
| 12256 | |||
| 12257 | static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, | ||
| 12258 | unsigned long count_field, | ||
| 12259 | unsigned long addr_field) | ||
| 12260 | { | ||
| 12261 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 12262 | int maxphyaddr; | ||
| 12263 | u64 count, addr; | ||
| 12264 | |||
| 12265 | if (vmcs12_read_any(vmcs12, count_field, &count) || | ||
| 12266 | vmcs12_read_any(vmcs12, addr_field, &addr)) { | ||
| 12267 | WARN_ON(1); | ||
| 12268 | return -EINVAL; | ||
| 12269 | } | ||
| 12270 | if (count == 0) | ||
| 12271 | return 0; | ||
| 12272 | maxphyaddr = cpuid_maxphyaddr(vcpu); | ||
| 12273 | if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || | ||
| 12274 | (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) { | ||
| 12275 | pr_debug_ratelimited( | ||
| 12276 | "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)", | ||
| 12277 | addr_field, maxphyaddr, count, addr); | ||
| 12278 | return -EINVAL; | ||
| 12279 | } | ||
| 12280 | return 0; | ||
| 12281 | } | ||
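
The range check above boils down to: the list must start on a 16-byte boundary (struct vmx_msr_entry is 16 bytes: u32 index, u32 reserved, u64 value), and both its first and last byte must lie below the guest's MAXPHYADDR. A standalone sketch of that check for a non-empty list (the kernel code above already returns early for count == 0; the numbers in main() are hypothetical):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Userspace restatement of the check above; assumes count != 0, since the
 * kernel returns early in that case. */
static bool msr_switch_area_ok(uint64_t addr, uint64_t count, int maxphyaddr)
{
        uint64_t last = addr + count * 16 - 1;   /* last byte of the list */

        return (addr % 16 == 0) &&
               !(addr >> maxphyaddr) &&
               !(last >> maxphyaddr);
}

int main(void)
{
        /* 8 entries at guest physical 0x1000, MAXPHYADDR = 46: valid. */
        printf("%d %d\n",
               msr_switch_area_ok(0x1000, 8, 46),
               msr_switch_area_ok(0x1001, 8, 46));  /* misaligned: invalid */
        return 0;
}
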
| 12282 | |||
| 12283 | static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, | ||
| 12284 | struct vmcs12 *vmcs12) | ||
| 12285 | { | ||
| 12286 | if (vmcs12->vm_exit_msr_load_count == 0 && | ||
| 12287 | vmcs12->vm_exit_msr_store_count == 0 && | ||
| 12288 | vmcs12->vm_entry_msr_load_count == 0) | ||
| 12289 | return 0; /* Fast path */ | ||
| 12290 | if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT, | ||
| 12291 | VM_EXIT_MSR_LOAD_ADDR) || | ||
| 12292 | nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT, | ||
| 12293 | VM_EXIT_MSR_STORE_ADDR) || | ||
| 12294 | nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT, | ||
| 12295 | VM_ENTRY_MSR_LOAD_ADDR)) | ||
| 12296 | return -EINVAL; | ||
| 12297 | return 0; | ||
| 12298 | } | ||
| 12299 | |||
| 12300 | static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, | ||
| 12301 | struct vmcs12 *vmcs12) | ||
| 12302 | { | ||
| 12303 | if (!nested_cpu_has_pml(vmcs12)) | ||
| 12304 | return 0; | ||
| 12305 | |||
| 12306 | if (!nested_cpu_has_ept(vmcs12) || | ||
| 12307 | !page_address_valid(vcpu, vmcs12->pml_address)) | ||
| 12308 | return -EINVAL; | ||
| 12309 | |||
| 12310 | return 0; | ||
| 12311 | } | ||
| 12312 | |||
| 12313 | static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, | ||
| 12314 | struct vmcs12 *vmcs12) | ||
| 12315 | { | ||
| 12316 | if (!nested_cpu_has_shadow_vmcs(vmcs12)) | ||
| 12317 | return 0; | ||
| 12318 | |||
| 12319 | if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) || | ||
| 12320 | !page_address_valid(vcpu, vmcs12->vmwrite_bitmap)) | ||
| 12321 | return -EINVAL; | ||
| 12322 | |||
| 12323 | return 0; | ||
| 12324 | } | ||
| 12325 | |||
| 12326 | static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, | ||
| 12327 | struct vmx_msr_entry *e) | ||
| 12328 | { | ||
| 12329 | /* x2APIC MSR accesses are not allowed */ | ||
| 12330 | if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8) | ||
| 12331 | return -EINVAL; | ||
| 12332 | if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */ | ||
| 12333 | e->index == MSR_IA32_UCODE_REV) | ||
| 12334 | return -EINVAL; | ||
| 12335 | if (e->reserved != 0) | ||
| 12336 | return -EINVAL; | ||
| 12337 | return 0; | ||
| 12338 | } | ||
| 12339 | |||
| 12340 | static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, | ||
| 12341 | struct vmx_msr_entry *e) | ||
| 12342 | { | ||
| 12343 | if (e->index == MSR_FS_BASE || | ||
| 12344 | e->index == MSR_GS_BASE || | ||
| 12345 | e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */ | ||
| 12346 | nested_vmx_msr_check_common(vcpu, e)) | ||
| 12347 | return -EINVAL; | ||
| 12348 | return 0; | ||
| 12349 | } | ||
| 12350 | |||
| 12351 | static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, | ||
| 12352 | struct vmx_msr_entry *e) | ||
| 12353 | { | ||
| 12354 | if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */ | ||
| 12355 | nested_vmx_msr_check_common(vcpu, e)) | ||
| 12356 | return -EINVAL; | ||
| 12357 | return 0; | ||
| 12358 | } | ||
| 12359 | |||
| 12360 | /* | ||
| 12361 | * Load the guest's/host's MSRs at nested entry/exit. | ||
| 12362 | * Returns 0 on success, or the 1-based index of the failing entry. | ||
| 12363 | */ | ||
| 12364 | static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) | ||
| 12365 | { | ||
| 12366 | u32 i; | ||
| 12367 | struct vmx_msr_entry e; | ||
| 12368 | struct msr_data msr; | ||
| 12369 | |||
| 12370 | msr.host_initiated = false; | ||
| 12371 | for (i = 0; i < count; i++) { | ||
| 12372 | if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), | ||
| 12373 | &e, sizeof(e))) { | ||
| 12374 | pr_debug_ratelimited( | ||
| 12375 | "%s cannot read MSR entry (%u, 0x%08llx)\n", | ||
| 12376 | __func__, i, gpa + i * sizeof(e)); | ||
| 12377 | goto fail; | ||
| 12378 | } | ||
| 12379 | if (nested_vmx_load_msr_check(vcpu, &e)) { | ||
| 12380 | pr_debug_ratelimited( | ||
| 12381 | "%s check failed (%u, 0x%x, 0x%x)\n", | ||
| 12382 | __func__, i, e.index, e.reserved); | ||
| 12383 | goto fail; | ||
| 12384 | } | ||
| 12385 | msr.index = e.index; | ||
| 12386 | msr.data = e.value; | ||
| 12387 | if (kvm_set_msr(vcpu, &msr)) { | ||
| 12388 | pr_debug_ratelimited( | ||
| 12389 | "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", | ||
| 12390 | __func__, i, e.index, e.value); | ||
| 12391 | goto fail; | ||
| 12392 | } | ||
| 12393 | } | ||
| 12394 | return 0; | ||
| 12395 | fail: | ||
| 12396 | return i + 1; | ||
| 12397 | } | ||
| 12398 | |||
| 12399 | static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) | ||
| 12400 | { | ||
| 12401 | u32 i; | ||
| 12402 | struct vmx_msr_entry e; | ||
| 12403 | |||
| 12404 | for (i = 0; i < count; i++) { | ||
| 12405 | struct msr_data msr_info; | ||
| 12406 | if (kvm_vcpu_read_guest(vcpu, | ||
| 12407 | gpa + i * sizeof(e), | ||
| 12408 | &e, 2 * sizeof(u32))) { | ||
| 12409 | pr_debug_ratelimited( | ||
| 12410 | "%s cannot read MSR entry (%u, 0x%08llx)\n", | ||
| 12411 | __func__, i, gpa + i * sizeof(e)); | ||
| 12412 | return -EINVAL; | ||
| 12413 | } | ||
| 12414 | if (nested_vmx_store_msr_check(vcpu, &e)) { | ||
| 12415 | pr_debug_ratelimited( | ||
| 12416 | "%s check failed (%u, 0x%x, 0x%x)\n", | ||
| 12417 | __func__, i, e.index, e.reserved); | ||
| 12418 | return -EINVAL; | ||
| 12419 | } | ||
| 12420 | msr_info.host_initiated = false; | ||
| 12421 | msr_info.index = e.index; | ||
| 12422 | if (kvm_get_msr(vcpu, &msr_info)) { | ||
| 12423 | pr_debug_ratelimited( | ||
| 12424 | "%s cannot read MSR (%u, 0x%x)\n", | ||
| 12425 | __func__, i, e.index); | ||
| 12426 | return -EINVAL; | ||
| 12427 | } | ||
| 12428 | if (kvm_vcpu_write_guest(vcpu, | ||
| 12429 | gpa + i * sizeof(e) + | ||
| 12430 | offsetof(struct vmx_msr_entry, value), | ||
| 12431 | &msr_info.data, sizeof(msr_info.data))) { | ||
| 12432 | pr_debug_ratelimited( | ||
| 12433 | "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", | ||
| 12434 | __func__, i, e.index, msr_info.data); | ||
| 12435 | return -EINVAL; | ||
| 12436 | } | ||
| 12437 | } | ||
| 12438 | return 0; | ||
| 12439 | } | ||
| 12440 | |||
| 12441 | static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val) | ||
| 12442 | { | ||
| 12443 | unsigned long invalid_mask; | ||
| 12444 | |||
| 12445 | invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu); | ||
| 12446 | return (val & invalid_mask) == 0; | ||
| 12447 | } | ||
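
nested_cr3_valid() simply rejects any CR3 value with a bit set at or above the guest's reported MAXPHYADDR. A userspace sketch of the same mask test (the MAXPHYADDR of 39 and the CR3 values below are made up):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Userspace restatement of nested_cr3_valid(): reject any value with a bit
 * set at or above the guest's reported MAXPHYADDR. */
static bool cr3_below_maxphyaddr(uint64_t cr3, int maxphyaddr)
{
        uint64_t invalid_mask = ~0ULL << maxphyaddr;

        return (cr3 & invalid_mask) == 0;
}

int main(void)
{
        /* Hypothetical MAXPHYADDR of 39: bit 40 makes the second value bad. */
        printf("%d %d\n",
               cr3_below_maxphyaddr(0x00000000fe000000ULL, 39),
               cr3_below_maxphyaddr(0x0000010000000000ULL, 39));
        return 0;
}
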
| 12448 | |||
| 12449 | /* | ||
| 12450 | * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are | ||
| 12451 | * emulating VM entry into a guest with EPT enabled. | ||
| 12452 | * Returns 0 on success, 1 on failure. Invalid state exit qualification code | ||
| 12453 | * is assigned to entry_failure_code on failure. | ||
| 12454 | */ | ||
| 12455 | static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, | ||
| 12456 | u32 *entry_failure_code) | ||
| 12457 | { | ||
| 12458 | if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) { | ||
| 12459 | if (!nested_cr3_valid(vcpu, cr3)) { | ||
| 12460 | *entry_failure_code = ENTRY_FAIL_DEFAULT; | ||
| 12461 | return 1; | ||
| 12462 | } | ||
| 12463 | |||
| 12464 | /* | ||
| 12465 | * If PAE paging and EPT are both on, CR3 is not used by the CPU and | ||
| 12466 | * must not be dereferenced. | ||
| 12467 | */ | ||
| 12468 | if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) && | ||
| 12469 | !nested_ept) { | ||
| 12470 | if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) { | ||
| 12471 | *entry_failure_code = ENTRY_FAIL_PDPTE; | ||
| 12472 | return 1; | ||
| 12473 | } | ||
| 12474 | } | ||
| 12475 | } | ||
| 12476 | |||
| 12477 | if (!nested_ept) | ||
| 12478 | kvm_mmu_new_cr3(vcpu, cr3, false); | ||
| 12479 | |||
| 12480 | vcpu->arch.cr3 = cr3; | ||
| 12481 | __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); | ||
| 12482 | |||
| 12483 | kvm_init_mmu(vcpu, false); | ||
| 12484 | |||
| 12485 | return 0; | ||
| 12486 | } | ||
| 12487 | |||
| 12488 | /* | ||
| 12489 | * Returns true if KVM can configure the CPU to tag TLB entries | ||
| 12490 | * populated by L2 differently from TLB entries populated | ||
| 12491 | * by L1. | ||
| 12492 | * | ||
| 12493 | * If L1 uses EPT, then TLB entries are tagged with a different EPTP. | ||
| 12494 | * | ||
| 12495 | * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged | ||
| 12496 | * with a different VPID (L1 entries are tagged with vmx->vpid | ||
| 12497 | * while L2 entries are tagged with vmx->nested.vpid02). | ||
| 12498 | */ | ||
| 12499 | static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) | ||
| 12500 | { | ||
| 12501 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 12502 | |||
| 12503 | return nested_cpu_has_ept(vmcs12) || | ||
| 12504 | (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); | ||
| 12505 | } | ||
| 12506 | |||
| 12507 | static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) | ||
| 12508 | { | ||
| 12509 | if (vmx->nested.nested_run_pending && | ||
| 12510 | (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) | ||
| 12511 | return vmcs12->guest_ia32_efer; | ||
| 12512 | else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) | ||
| 12513 | return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); | ||
| 12514 | else | ||
| 12515 | return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); | ||
| 12516 | } | ||
| 12517 | |||
| 12518 | static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) | ||
| 12519 | { | ||
| 12520 | /* | ||
| 12521 | * If vmcs02 hasn't been initialized, set the constant vmcs02 state | ||
| 12522 | * according to L0's settings (vmcs12 is irrelevant here). Host | ||
| 12523 | * fields that come from L0 and are not constant, e.g. HOST_CR3, | ||
| 12524 | * will be set as needed prior to VMLAUNCH/VMRESUME. | ||
| 12525 | */ | ||
| 12526 | if (vmx->nested.vmcs02_initialized) | ||
| 12527 | return; | ||
| 12528 | vmx->nested.vmcs02_initialized = true; | ||
| 12529 | |||
| 12530 | /* | ||
| 12531 | * We don't care what the EPTP value is; we just need to guarantee | ||
| 12532 | * it's valid so we don't get a false positive when doing early | ||
| 12533 | * consistency checks. | ||
| 12534 | */ | ||
| 12535 | if (enable_ept && nested_early_check) | ||
| 12536 | vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0)); | ||
| 12537 | |||
| 12538 | /* All VMFUNCs are currently emulated through L0 vmexits. */ | ||
| 12539 | if (cpu_has_vmx_vmfunc()) | ||
| 12540 | vmcs_write64(VM_FUNCTION_CONTROL, 0); | ||
| 12541 | |||
| 12542 | if (cpu_has_vmx_posted_intr()) | ||
| 12543 | vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); | ||
| 12544 | |||
| 12545 | if (cpu_has_vmx_msr_bitmap()) | ||
| 12546 | vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); | ||
| 12547 | |||
| 12548 | if (enable_pml) | ||
| 12549 | vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); | ||
| 12550 | |||
| 12551 | /* | ||
| 12552 | * Set the MSR load/store lists to match L0's settings. Only the | ||
| 12553 | * addresses are constant (for vmcs02); the counts can change based | ||
| 12554 | * on L2's behavior, e.g. switching to/from long mode. | ||
| 12555 | */ | ||
| 12556 | vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); | ||
| 12557 | vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); | ||
| 12558 | vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); | ||
| 12559 | |||
| 12560 | vmx_set_constant_host_state(vmx); | ||
| 12561 | } | ||
| 12562 | |||
| 12563 | static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx, | ||
| 12564 | struct vmcs12 *vmcs12) | ||
| 12565 | { | ||
| 12566 | prepare_vmcs02_constant_state(vmx); | ||
| 12567 | |||
| 12568 | vmcs_write64(VMCS_LINK_POINTER, -1ull); | ||
| 12569 | |||
| 12570 | if (enable_vpid) { | ||
| 12571 | if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) | ||
| 12572 | vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); | ||
| 12573 | else | ||
| 12574 | vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); | ||
| 12575 | } | ||
| 12576 | } | ||
| 12577 | |||
| 12578 | static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) | ||
| 12579 | { | ||
| 12580 | u32 exec_control, vmcs12_exec_ctrl; | ||
| 12581 | u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); | ||
| 12582 | |||
| 12583 | if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) | ||
| 12584 | prepare_vmcs02_early_full(vmx, vmcs12); | ||
| 12585 | |||
| 12586 | /* | ||
| 12587 | * HOST_RSP is normally set correctly in vmx_vcpu_run() just before | ||
| 12588 | * entry, but only if the current (host) sp changed from the value | ||
| 12589 | * we wrote last (vmx->host_rsp). This cache is no longer relevant | ||
| 12590 | * if we switch vmcs, and rather than hold a separate cache per vmcs, | ||
| 12591 | * here we just force the write to happen on entry. host_rsp will | ||
| 12592 | * also be written unconditionally by nested_vmx_check_vmentry_hw() | ||
| 12593 | * if we are doing early consistency checks via hardware. | ||
| 12594 | */ | ||
| 12595 | vmx->host_rsp = 0; | ||
| 12596 | |||
| 12597 | /* | ||
| 12598 | * PIN CONTROLS | ||
| 12599 | */ | ||
| 12600 | exec_control = vmcs12->pin_based_vm_exec_control; | ||
| 12601 | |||
| 12602 | /* Preemption timer setting is computed directly in vmx_vcpu_run. */ | ||
| 12603 | exec_control |= vmcs_config.pin_based_exec_ctrl; | ||
| 12604 | exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; | ||
| 12605 | vmx->loaded_vmcs->hv_timer_armed = false; | ||
| 12606 | |||
| 12607 | /* Posted interrupts setting is only taken from vmcs12. */ | ||
| 12608 | if (nested_cpu_has_posted_intr(vmcs12)) { | ||
| 12609 | vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; | ||
| 12610 | vmx->nested.pi_pending = false; | ||
| 12611 | } else { | ||
| 12612 | exec_control &= ~PIN_BASED_POSTED_INTR; | ||
| 12613 | } | ||
| 12614 | vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); | ||
| 12615 | |||
| 12616 | /* | ||
| 12617 | * EXEC CONTROLS | ||
| 12618 | */ | ||
| 12619 | exec_control = vmx_exec_control(vmx); /* L0's desires */ | ||
| 12620 | exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; | ||
| 12621 | exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; | ||
| 12622 | exec_control &= ~CPU_BASED_TPR_SHADOW; | ||
| 12623 | exec_control |= vmcs12->cpu_based_vm_exec_control; | ||
| 12624 | |||
| 12625 | /* | ||
| 12626 | * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if | ||
| 12627 | * nested_get_vmcs12_pages can't fix it up, the illegal value | ||
| 12628 | * will result in a VM entry failure. | ||
| 12629 | */ | ||
| 12630 | if (exec_control & CPU_BASED_TPR_SHADOW) { | ||
| 12631 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); | ||
| 12632 | vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); | ||
| 12633 | } else { | ||
| 12634 | #ifdef CONFIG_X86_64 | ||
| 12635 | exec_control |= CPU_BASED_CR8_LOAD_EXITING | | ||
| 12636 | CPU_BASED_CR8_STORE_EXITING; | ||
| 12637 | #endif | ||
| 12638 | } | ||
| 12639 | |||
| 12640 | /* | ||
| 12641 | * A vmexit (to either L1 hypervisor or L0 userspace) is always needed | ||
| 12642 | * for I/O port accesses. | ||
| 12643 | */ | ||
| 12644 | exec_control &= ~CPU_BASED_USE_IO_BITMAPS; | ||
| 12645 | exec_control |= CPU_BASED_UNCOND_IO_EXITING; | ||
| 12646 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); | ||
| 12647 | |||
| 12648 | /* | ||
| 12649 | * SECONDARY EXEC CONTROLS | ||
| 12650 | */ | ||
| 12651 | if (cpu_has_secondary_exec_ctrls()) { | ||
| 12652 | exec_control = vmx->secondary_exec_control; | ||
| 12653 | |||
| 12654 | /* Take the following fields only from vmcs12 */ | ||
| 12655 | exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | | ||
| 12656 | SECONDARY_EXEC_ENABLE_INVPCID | | ||
| 12657 | SECONDARY_EXEC_RDTSCP | | ||
| 12658 | SECONDARY_EXEC_XSAVES | | ||
| 12659 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | | ||
| 12660 | SECONDARY_EXEC_APIC_REGISTER_VIRT | | ||
| 12661 | SECONDARY_EXEC_ENABLE_VMFUNC); | ||
| 12662 | if (nested_cpu_has(vmcs12, | ||
| 12663 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) { | ||
| 12664 | vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control & | ||
| 12665 | ~SECONDARY_EXEC_ENABLE_PML; | ||
| 12666 | exec_control |= vmcs12_exec_ctrl; | ||
| 12667 | } | ||
| 12668 | |||
| 12669 | /* VMCS shadowing for L2 is emulated for now */ | ||
| 12670 | exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; | ||
| 12671 | |||
| 12672 | if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) | ||
| 12673 | vmcs_write16(GUEST_INTR_STATUS, | ||
| 12674 | vmcs12->guest_intr_status); | ||
| 12675 | |||
| 12676 | /* | ||
| 12677 | * Write an illegal value to APIC_ACCESS_ADDR. Later, | ||
| 12678 | * nested_get_vmcs12_pages will either fix it up or | ||
| 12679 | * remove the VM execution control. | ||
| 12680 | */ | ||
| 12681 | if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) | ||
| 12682 | vmcs_write64(APIC_ACCESS_ADDR, -1ull); | ||
| 12683 | |||
| 12684 | if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) | ||
| 12685 | vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); | ||
| 12686 | |||
| 12687 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); | ||
| 12688 | } | ||
| 12689 | |||
| 12690 | /* | ||
| 12691 | * ENTRY CONTROLS | ||
| 12692 | * | ||
| 12693 | * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE | ||
| 12694 | * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate | ||
| 12695 | * on the related bits (if supported by the CPU) in the hope that | ||
| 12696 | * we can avoid VMWrites during vmx_set_efer(). | ||
| 12697 | */ | ||
| 12698 | exec_control = (vmcs12->vm_entry_controls | vmcs_config.vmentry_ctrl) & | ||
| 12699 | ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER; | ||
| 12700 | if (cpu_has_load_ia32_efer) { | ||
| 12701 | if (guest_efer & EFER_LMA) | ||
| 12702 | exec_control |= VM_ENTRY_IA32E_MODE; | ||
| 12703 | if (guest_efer != host_efer) | ||
| 12704 | exec_control |= VM_ENTRY_LOAD_IA32_EFER; | ||
| 12705 | } | ||
| 12706 | vm_entry_controls_init(vmx, exec_control); | ||
| 12707 | |||
| 12708 | /* | ||
| 12709 | * EXIT CONTROLS | ||
| 12710 | * | ||
| 12711 | * L2->L1 exit controls are emulated - the hardware exit is to L0 so | ||
| 12712 | * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER | ||
| 12713 | * bits may be modified by vmx_set_efer() in prepare_vmcs02(). | ||
| 12714 | */ | ||
| 12715 | exec_control = vmcs_config.vmexit_ctrl; | ||
| 12716 | if (cpu_has_load_ia32_efer && guest_efer != host_efer) | ||
| 12717 | exec_control |= VM_EXIT_LOAD_IA32_EFER; | ||
| 12718 | vm_exit_controls_init(vmx, exec_control); | ||
| 12719 | |||
| 12720 | /* | ||
| 12721 | * Conceptually we want to copy the PML address and index from | ||
| 12722 | * vmcs01 here, and then back to vmcs01 on nested vmexit. But, | ||
| 12723 | * since we always flush the log on each vmexit and never change | ||
| 12724 | * the PML address (once set), this happens to be equivalent to | ||
| 12725 | * simply resetting the index in vmcs02. | ||
| 12726 | */ | ||
| 12727 | if (enable_pml) | ||
| 12728 | vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); | ||
| 12729 | |||
| 12730 | /* | ||
| 12731 | * Interrupt/Exception Fields | ||
| 12732 | */ | ||
| 12733 | if (vmx->nested.nested_run_pending) { | ||
| 12734 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
| 12735 | vmcs12->vm_entry_intr_info_field); | ||
| 12736 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, | ||
| 12737 | vmcs12->vm_entry_exception_error_code); | ||
| 12738 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, | ||
| 12739 | vmcs12->vm_entry_instruction_len); | ||
| 12740 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, | ||
| 12741 | vmcs12->guest_interruptibility_info); | ||
| 12742 | vmx->loaded_vmcs->nmi_known_unmasked = | ||
| 12743 | !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); | ||
| 12744 | } else { | ||
| 12745 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); | ||
| 12746 | } | ||
| 12747 | } | ||
| 12748 | |||
| 12749 | static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) | ||
| 12750 | { | ||
| 12751 | struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; | ||
| 12752 | |||
| 12753 | if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & | ||
| 12754 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { | ||
| 12755 | vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); | ||
| 12756 | vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); | ||
| 12757 | vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); | ||
| 12758 | vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); | ||
| 12759 | vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); | ||
| 12760 | vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); | ||
| 12761 | vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); | ||
| 12762 | vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); | ||
| 12763 | vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); | ||
| 12764 | vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); | ||
| 12765 | vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); | ||
| 12766 | vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); | ||
| 12767 | vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); | ||
| 12768 | vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); | ||
| 12769 | vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); | ||
| 12770 | vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); | ||
| 12771 | vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); | ||
| 12772 | vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); | ||
| 12773 | vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); | ||
| 12774 | vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); | ||
| 12775 | vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); | ||
| 12776 | vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); | ||
| 12777 | vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); | ||
| 12778 | vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); | ||
| 12779 | vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); | ||
| 12780 | vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); | ||
| 12781 | vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); | ||
| 12782 | vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); | ||
| 12783 | vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); | ||
| 12784 | vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); | ||
| 12785 | vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); | ||
| 12786 | vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); | ||
| 12787 | vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); | ||
| 12788 | vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); | ||
| 12789 | } | ||
| 12790 | |||
| 12791 | if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & | ||
| 12792 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { | ||
| 12793 | vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); | ||
| 12794 | vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, | ||
| 12795 | vmcs12->guest_pending_dbg_exceptions); | ||
| 12796 | vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); | ||
| 12797 | vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); | ||
| 12798 | |||
| 12799 | /* | ||
| 12800 | * L1 may access the L2's PDPTRs, so save them to construct | ||
| 12801 | * vmcs12. | ||
| 12802 | */ | ||
| 12803 | if (enable_ept) { | ||
| 12804 | vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); | ||
| 12805 | vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); | ||
| 12806 | vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); | ||
| 12807 | vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); | ||
| 12808 | } | ||
| 12809 | } | ||
| 12810 | |||
| 12811 | if (nested_cpu_has_xsaves(vmcs12)) | ||
| 12812 | vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); | ||
| 12813 | |||
| 12814 | /* | ||
| 12815 | * Whether page-faults are trapped is determined by a combination of | ||
| 12816 | * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. | ||
| 12817 | * If enable_ept, L0 doesn't care about page faults and we should | ||
| 12818 | * set all of these to L1's desires. However, if !enable_ept, L0 does | ||
| 12819 | * care about (at least some) page faults, and because it is not easy | ||
| 12820 | * (if at all possible?) to merge L0 and L1's desires, we simply ask | ||
| 12821 | * to exit on each and every L2 page fault. This is done by setting | ||
| 12822 | * MASK=MATCH=0 and (see below) EB.PF=1. | ||
| 12823 | * Note that below we don't need special code to set EB.PF beyond the | ||
| 12824 | * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, | ||
| 12825 | * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when | ||
| 12826 | * !enable_ept, EB.PF is 1, so the "or" will always be 1. | ||
| 12827 | */ | ||
| 12828 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, | ||
| 12829 | enable_ept ? vmcs12->page_fault_error_code_mask : 0); | ||
| 12830 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, | ||
| 12831 | enable_ept ? vmcs12->page_fault_error_code_match : 0); | ||
| 12832 | |||
| 12833 | if (cpu_has_vmx_apicv()) { | ||
| 12834 | vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); | ||
| 12835 | vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); | ||
| 12836 | vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); | ||
| 12837 | vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); | ||
| 12838 | } | ||
| 12839 | |||
| 12840 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); | ||
| 12841 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); | ||
| 12842 | |||
| 12843 | set_cr4_guest_host_mask(vmx); | ||
| 12844 | |||
| 12845 | if (kvm_mpx_supported()) { | ||
| 12846 | if (vmx->nested.nested_run_pending && | ||
| 12847 | (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) | ||
| 12848 | vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); | ||
| 12849 | else | ||
| 12850 | vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); | ||
| 12851 | } | ||
| 12852 | } | ||
| 12853 | |||
| 12854 | /* | ||
| 12855 | * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested | ||
| 12856 | * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it | ||
| 12857 | * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 | ||
| 12858 | * guest in a way that is appropriate both to L1's requests and to our | ||
| 12859 | * needs. In addition to modifying the active vmcs (which is vmcs02), this | ||
| 12860 | * function has other necessary side effects, like setting various | ||
| 12861 | * vcpu->arch fields. | ||
| 12862 | * Returns 0 on success, 1 on failure. Invalid state exit qualification code | ||
| 12863 | * is assigned to entry_failure_code on failure. | ||
| 12864 | */ | ||
| 12865 | static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, | ||
| 12866 | u32 *entry_failure_code) | ||
| 12867 | { | ||
| 12868 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 12869 | struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; | ||
| 12870 | |||
| 12871 | if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) { | ||
| 12872 | prepare_vmcs02_full(vmx, vmcs12); | ||
| 12873 | vmx->nested.dirty_vmcs12 = false; | ||
| 12874 | } | ||
| 12875 | |||
| 12876 | /* | ||
| 12877 | * First, the fields that are shadowed. This must be kept in sync | ||
| 12878 | * with vmx_shadow_fields.h. | ||
| 12879 | */ | ||
| 12880 | if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & | ||
| 12881 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { | ||
| 12882 | vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); | ||
| 12883 | vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); | ||
| 12884 | } | ||
| 12885 | |||
| 12886 | if (vmx->nested.nested_run_pending && | ||
| 12887 | (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { | ||
| 12888 | kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); | ||
| 12889 | vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); | ||
| 12890 | } else { | ||
| 12891 | kvm_set_dr(vcpu, 7, vcpu->arch.dr7); | ||
| 12892 | vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); | ||
| 12893 | } | ||
| 12894 | vmx_set_rflags(vcpu, vmcs12->guest_rflags); | ||
| 12895 | |||
| 12896 | vmx->nested.preemption_timer_expired = false; | ||
| 12897 | if (nested_cpu_has_preemption_timer(vmcs12)) | ||
| 12898 | vmx_start_preemption_timer(vcpu); | ||
| 12899 | |||
| 12900 | /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the | ||
| 12901 | * bitwise-or of what L1 wants to trap for L2, and what we want to | ||
| 12902 | * trap. Note that CR0.TS also needs updating - we do this later. | ||
| 12903 | */ | ||
| 12904 | update_exception_bitmap(vcpu); | ||
| 12905 | vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; | ||
| 12906 | vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); | ||
| 12907 | |||
| 12908 | if (vmx->nested.nested_run_pending && | ||
| 12909 | (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { | ||
| 12910 | vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); | ||
| 12911 | vcpu->arch.pat = vmcs12->guest_ia32_pat; | ||
| 12912 | } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { | ||
| 12913 | vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); | ||
| 12914 | } | ||
| 12915 | |||
| 12916 | vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); | ||
| 12917 | |||
| 12918 | if (kvm_has_tsc_control) | ||
| 12919 | decache_tsc_multiplier(vmx); | ||
| 12920 | |||
| 12921 | if (enable_vpid) { | ||
| 12922 | /* | ||
| 12923 | * There is no direct mapping between vpid02 and vpid12, the | ||
| 12924 | * vpid02 is per-vCPU for L0 and reused while the value of | ||
| 12925 | * vpid12 is changed w/ one invvpid during nested vmentry. | ||
| 12926 | * The vpid12 is allocated by L1 for L2, so it will not | ||
| 12927 | * influence global bitmap(for vpid01 and vpid02 allocation) | ||
| 12928 | * even if spawn a lot of nested vCPUs. | ||
| 12929 | */ | ||
| 12930 | if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) { | ||
| 12931 | if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { | ||
| 12932 | vmx->nested.last_vpid = vmcs12->virtual_processor_id; | ||
| 12933 | __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false); | ||
| 12934 | } | ||
| 12935 | } else { | ||
| 12936 | /* | ||
| 12937 | * If L1 uses EPT, then L0 needs to execute INVEPT on | ||
| 12938 | * EPTP02 instead of EPTP01. Therefore, delay TLB | ||
| 12939 | * flush until vmcs02->eptp is fully updated by | ||
| 12940 | * KVM_REQ_LOAD_CR3. Note that this assumes | ||
| 12941 | * KVM_REQ_TLB_FLUSH is evaluated after | ||
| 12942 | * KVM_REQ_LOAD_CR3 in vcpu_enter_guest(). | ||
| 12943 | */ | ||
| 12944 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); | ||
| 12945 | } | ||
| 12946 | } | ||
| 12947 | |||
| 12948 | if (nested_cpu_has_ept(vmcs12)) | ||
| 12949 | nested_ept_init_mmu_context(vcpu); | ||
| 12950 | else if (nested_cpu_has2(vmcs12, | ||
| 12951 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) | ||
| 12952 | vmx_flush_tlb(vcpu, true); | ||
| 12953 | |||
| 12954 | /* | ||
| 12955 | * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those | ||
| 12956 | * bits which we consider mandatory enabled. | ||
| 12957 | * The CR0_READ_SHADOW is what L2 should have expected to read given | ||
| 12958 | * the specifications by L1; it's not enough to take | ||
| 12959 | * vmcs12->cr0_read_shadow because our cr0_guest_host_mask can | ||
| 12960 | * have more bits set than L1 expected. | ||
| 12961 | */ | ||
| 12962 | vmx_set_cr0(vcpu, vmcs12->guest_cr0); | ||
| 12963 | vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); | ||
| 12964 | |||
| 12965 | vmx_set_cr4(vcpu, vmcs12->guest_cr4); | ||
| 12966 | vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); | ||
| 12967 | |||
| 12968 | vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); | ||
| 12969 | /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ | ||
| 12970 | vmx_set_efer(vcpu, vcpu->arch.efer); | ||
| 12971 | |||
| 12972 | /* | ||
| 12973 | * If guest state is invalid and unrestricted guest is disabled, | ||
| 12974 | * then L1 attempted VMEntry to L2 with invalid state. | ||
| 12975 | * Fail the VMEntry. | ||
| 12976 | */ | ||
| 12977 | if (vmx->emulation_required) { | ||
| 12978 | *entry_failure_code = ENTRY_FAIL_DEFAULT; | ||
| 12979 | return 1; | ||
| 12980 | } | ||
| 12981 | |||
| 12982 | /* Load guest CR3, backed by either nested EPT or shadow page tables. */ | ||
| 12983 | if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), | ||
| 12984 | entry_failure_code)) | ||
| 12985 | return 1; | ||
| 12986 | |||
| 12987 | if (!enable_ept) | ||
| 12988 | vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; | ||
| 12989 | |||
| 12990 | kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); | ||
| 12991 | kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); | ||
| 12992 | return 0; | ||
| 12993 | } | ||
| 12994 | |||
| 12995 | static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) | ||
| 12996 | { | ||
| 12997 | if (!nested_cpu_has_nmi_exiting(vmcs12) && | ||
| 12998 | nested_cpu_has_virtual_nmis(vmcs12)) | ||
| 12999 | return -EINVAL; | ||
| 13000 | |||
| 13001 | if (!nested_cpu_has_virtual_nmis(vmcs12) && | ||
| 13002 | nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING)) | ||
| 13003 | return -EINVAL; | ||
| 13004 | |||
| 13005 | return 0; | ||
| 13006 | } | ||
| 13007 | |||
| 13008 | static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | ||
| 13009 | { | ||
| 13010 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 13011 | bool ia32e; | ||
| 13012 | |||
| 13013 | if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && | ||
| 13014 | vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) | ||
| 13015 | return VMXERR_ENTRY_INVALID_CONTROL_FIELD; | ||
| 13016 | |||
| 13017 | if (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id) | ||
| 13018 | return VMXERR_ENTRY_INVALID_CONTROL_FIELD; | ||
| 13019 | |||
| 13020 | if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12)) | ||
| 13021 | return VMXERR_ENTRY_INVALID_CONTROL_FIELD; | ||
| 13022 | |||
| 13023 | if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) | ||
| 13024 | return VMXERR_ENTRY_INVALID_CONTROL_FIELD; | ||
| 13025 | |||
| 13026 | if (nested_vmx_check_apic_access_controls(vcpu, vmcs12)) | ||
| 13027 | return VMXERR_ENTRY_INVALID_CONTROL_FIELD; | ||
| 13028 | |||
| 13029 | if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12)) | ||
| 13030 | return VMXERR_ENTRY_INVALID_CONTROL_FIELD; | ||
| 13031 | |||
| 13032 | if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) | ||
| 13033 | return VMXERR_ENTRY_INVALID_CONTROL_FIELD; | ||
| 13034 | |||
| 13035 | if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) | ||
| 13036 | return VMXERR_ENTRY_INVALID_CONTROL_FIELD; | ||
| 13037 | |||
| 13038 | if (nested_vmx_check_pml_controls(vcpu, vmcs12)) | ||
| 13039 | return VMXERR_ENTRY_INVALID_CONTROL_FIELD; | ||
| 13040 | |||
| 13041 | if (nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12)) | ||
| 13042 | return VMXERR_ENTRY_INVALID_CONTROL_FIELD; | ||
| 13043 | |||
| 13044 | if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, | ||
| 13045 | vmx->nested.msrs.procbased_ctls_low, | ||
| 13046 | vmx->nested.msrs.procbased_ctls_high) || | ||
| 13047 | (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && | ||
| 13048 | !vmx_control_verify(vmcs12->secondary_vm_exec_control, | ||
| 13049 | vmx->nested.msrs.secondary_ctls_low, | ||
| 13050 | vmx->nested.msrs.secondary_ctls_high)) || | ||
| 13051 | !vmx_control_verify(vmcs12->pin_based_vm_exec_control, | ||
| 13052 | vmx->nested.msrs.pinbased_ctls_low, | ||
| 13053 | vmx->nested.msrs.pinbased_ctls_high) || | ||
| 13054 | !vmx_control_verify(vmcs12->vm_exit_controls, | ||
| 13055 | vmx->nested.msrs.exit_ctls_low, | ||
| 13056 | vmx->nested.msrs.exit_ctls_high) || | ||
| 13057 | !vmx_control_verify(vmcs12->vm_entry_controls, | ||
| 13058 | vmx->nested.msrs.entry_ctls_low, | ||
| 13059 | vmx->nested.msrs.entry_ctls_high)) | ||
| 13060 | return VMXERR_ENTRY_INVALID_CONTROL_FIELD; | ||
| 13061 | |||
| 13062 | if (nested_vmx_check_nmi_controls(vmcs12)) | ||
| 13063 | return VMXERR_ENTRY_INVALID_CONTROL_FIELD; | ||
| 13064 | |||
| 13065 | if (nested_cpu_has_vmfunc(vmcs12)) { | ||
| 13066 | if (vmcs12->vm_function_control & | ||
| 13067 | ~vmx->nested.msrs.vmfunc_controls) | ||
| 13068 | return VMXERR_ENTRY_INVALID_CONTROL_FIELD; | ||
| 13069 | |||
| 13070 | if (nested_cpu_has_eptp_switching(vmcs12)) { | ||
| 13071 | if (!nested_cpu_has_ept(vmcs12) || | ||
| 13072 | !page_address_valid(vcpu, vmcs12->eptp_list_address)) | ||
| 13073 | return VMXERR_ENTRY_INVALID_CONTROL_FIELD; | ||
| 13074 | } | ||
| 13075 | } | ||
| 13076 | |||
| 13077 | if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) | ||
| 13078 | return VMXERR_ENTRY_INVALID_CONTROL_FIELD; | ||
| 13079 | |||
| 13080 | if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || | ||
| 13081 | !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || | ||
| 13082 | !nested_cr3_valid(vcpu, vmcs12->host_cr3)) | ||
| 13083 | return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; | ||
| 13084 | |||
| 13085 | /* | ||
| 13086 | * If the load IA32_EFER VM-exit control is 1, bits reserved in the | ||
| 13087 | * IA32_EFER MSR must be 0 in the field for that register. In addition, | ||
| 13088 | * the values of the LMA and LME bits in the field must each be that of | ||
| 13089 | * the host address-space size VM-exit control. | ||
| 13090 | */ | ||
| 13091 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { | ||
| 13092 | ia32e = (vmcs12->vm_exit_controls & | ||
| 13093 | VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; | ||
| 13094 | if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || | ||
| 13095 | ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || | ||
| 13096 | ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) | ||
| 13097 | return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; | ||
| 13098 | } | ||
| 13099 | |||
| 13100 | /* | ||
| 13101 | * From the Intel SDM, volume 3: | ||
| 13102 | * Fields relevant to VM-entry event injection must be set properly. | ||
| 13103 | * These fields are the VM-entry interruption-information field, the | ||
| 13104 | * VM-entry exception error code, and the VM-entry instruction length. | ||
| 13105 | */ | ||
| 13106 | if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { | ||
| 13107 | u32 intr_info = vmcs12->vm_entry_intr_info_field; | ||
| 13108 | u8 vector = intr_info & INTR_INFO_VECTOR_MASK; | ||
| 13109 | u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; | ||
| 13110 | bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; | ||
| 13111 | bool should_have_error_code; | ||
| 13112 | bool urg = nested_cpu_has2(vmcs12, | ||
| 13113 | SECONDARY_EXEC_UNRESTRICTED_GUEST); | ||
| 13114 | bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; | ||
| 13115 | |||
| 13116 | /* VM-entry interruption-info field: interruption type */ | ||
| 13117 | if (intr_type == INTR_TYPE_RESERVED || | ||
| 13118 | (intr_type == INTR_TYPE_OTHER_EVENT && | ||
| 13119 | !nested_cpu_supports_monitor_trap_flag(vcpu))) | ||
| 13120 | return VMXERR_ENTRY_INVALID_CONTROL_FIELD; | ||
| 13121 | |||
| 13122 | /* VM-entry interruption-info field: vector */ | ||
| 13123 | if ((intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || | ||
| 13124 | (intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || | ||
| 13125 | (intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) | ||
| 13126 | return VMXERR_ENTRY_INVALID_CONTROL_FIELD; | ||
| 13127 | |||
| 13128 | /* VM-entry interruption-info field: deliver error code */ | ||
| 13129 | should_have_error_code = | ||
| 13130 | intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && | ||
| 13131 | x86_exception_has_error_code(vector); | ||
| 13132 | if (has_error_code != should_have_error_code) | ||
| 13133 | return VMXERR_ENTRY_INVALID_CONTROL_FIELD; | ||
| 13134 | |||
| 13135 | /* VM-entry exception error code */ | ||
| 13136 | if (has_error_code && | ||
| 13137 | vmcs12->vm_entry_exception_error_code & GENMASK(31, 15)) | ||
| 13138 | return VMXERR_ENTRY_INVALID_CONTROL_FIELD; | ||
| 13139 | |||
| 13140 | /* VM-entry interruption-info field: reserved bits */ | ||
| 13141 | if (intr_info & INTR_INFO_RESVD_BITS_MASK) | ||
| 13142 | return VMXERR_ENTRY_INVALID_CONTROL_FIELD; | ||
| 13143 | |||
| 13144 | /* VM-entry instruction length */ | ||
| 13145 | switch (intr_type) { | ||
| 13146 | case INTR_TYPE_SOFT_EXCEPTION: | ||
| 13147 | case INTR_TYPE_SOFT_INTR: | ||
| 13148 | case INTR_TYPE_PRIV_SW_EXCEPTION: | ||
| 13149 | if ((vmcs12->vm_entry_instruction_len > 15) || | ||
| 13150 | (vmcs12->vm_entry_instruction_len == 0 && | ||
| 13151 | !nested_cpu_has_zero_length_injection(vcpu))) | ||
| 13152 | return VMXERR_ENTRY_INVALID_CONTROL_FIELD; | ||
| 13153 | } | ||
| 13154 | } | ||
| 13155 | |||
| 13156 | if (nested_cpu_has_ept(vmcs12) && | ||
| 13157 | !valid_ept_address(vcpu, vmcs12->ept_pointer)) | ||
| 13158 | return VMXERR_ENTRY_INVALID_CONTROL_FIELD; | ||
| 13159 | |||
| 13160 | return 0; | ||
| 13161 | } | ||
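The event-injection block above packs several SDM rules into a few conditionals. As a rough standalone sketch of just the error-code rule, not part of the patch, the snippet below uses hypothetical stand-ins for the kernel's INTR_TYPE_HARD_EXCEPTION and x86_exception_has_error_code() so it compiles on its own:

    #include <stdbool.h>
    #include <stdio.h>

    #define TYPE_HARD_EXCEPTION 3  /* stand-in for INTR_TYPE_HARD_EXCEPTION (type field value 3) */

    /* Vectors that architecturally push an error code: #DF, #TS, #NP, #SS, #GP, #PF, #AC. */
    static bool vector_has_error_code(unsigned int vector)
    {
            return vector == 8 || (vector >= 10 && vector <= 14) || vector == 17;
    }

    /* Mirrors the has_error_code vs. should_have_error_code comparison above. */
    static bool injection_valid(unsigned int type, unsigned int vector,
                                bool has_error_code, bool prot_mode)
    {
            bool should = type == TYPE_HARD_EXCEPTION && prot_mode &&
                          vector_has_error_code(vector);
            return has_error_code == should;
    }

    int main(void)
    {
            printf("#PF with error code:    %d\n", injection_valid(3, 14, true, true));   /* 1: allowed */
            printf("#BP with error code:    %d\n", injection_valid(3, 3, true, true));    /* 0: rejected */
            printf("#GP, real mode, no err: %d\n", injection_valid(3, 13, false, false)); /* 1: allowed */
            return 0;
    }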
| 13162 | |||
| 13163 | static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, | ||
| 13164 | struct vmcs12 *vmcs12) | ||
| 13165 | { | ||
| 13166 | int r; | ||
| 13167 | struct page *page; | ||
| 13168 | struct vmcs12 *shadow; | ||
| 13169 | |||
| 13170 | if (vmcs12->vmcs_link_pointer == -1ull) | ||
| 13171 | return 0; | ||
| 13172 | |||
| 13173 | if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)) | ||
| 13174 | return -EINVAL; | ||
| 13175 | |||
| 13176 | page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); | ||
| 13177 | if (is_error_page(page)) | ||
| 13178 | return -EINVAL; | ||
| 13179 | |||
| 13180 | r = 0; | ||
| 13181 | shadow = kmap(page); | ||
| 13182 | if (shadow->hdr.revision_id != VMCS12_REVISION || | ||
| 13183 | shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)) | ||
| 13184 | r = -EINVAL; | ||
| 13185 | kunmap(page); | ||
| 13186 | kvm_release_page_clean(page); | ||
| 13187 | return r; | ||
| 13188 | } | ||
| 13189 | |||
| 13190 | static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, | ||
| 13191 | u32 *exit_qual) | ||
| 13192 | { | ||
| 13193 | bool ia32e; | ||
| 13194 | |||
| 13195 | *exit_qual = ENTRY_FAIL_DEFAULT; | ||
| 13196 | |||
| 13197 | if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) || | ||
| 13198 | !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) | ||
| 13199 | return 1; | ||
| 13200 | |||
| 13201 | if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { | ||
| 13202 | *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; | ||
| 13203 | return 1; | ||
| 13204 | } | ||
| 13205 | |||
| 13206 | /* | ||
| 13207 | * If the load IA32_EFER VM-entry control is 1, the following checks | ||
| 13208 | * are performed on the field for the IA32_EFER MSR: | ||
| 13209 | * - Bits reserved in the IA32_EFER MSR must be 0. | ||
| 13210 | * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of | ||
| 13211 | * the IA-32e mode guest VM-exit control. It must also be identical | ||
| 13212 | * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to | ||
| 13213 | * CR0.PG) is 1. | ||
| 13214 | */ | ||
| 13215 | if (to_vmx(vcpu)->nested.nested_run_pending && | ||
| 13216 | (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { | ||
| 13217 | ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; | ||
| 13218 | if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) || | ||
| 13219 | ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || | ||
| 13220 | ((vmcs12->guest_cr0 & X86_CR0_PG) && | ||
| 13221 | ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) | ||
| 13222 | return 1; | ||
| 13223 | } | ||
| 13224 | |||
| 13225 | if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && | ||
| 13226 | (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || | ||
| 13227 | (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) | ||
| 13228 | return 1; | ||
| 13229 | |||
| 13230 | return 0; | ||
| 13231 | } | ||
| 13232 | |||
| 13233 | static int __noclone nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) | ||
| 13234 | { | ||
| 13235 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 13236 | unsigned long cr3, cr4; | ||
| 13237 | |||
| 13238 | if (!nested_early_check) | ||
| 13239 | return 0; | ||
| 13240 | |||
| 13241 | if (vmx->msr_autoload.host.nr) | ||
| 13242 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); | ||
| 13243 | if (vmx->msr_autoload.guest.nr) | ||
| 13244 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); | ||
| 13245 | |||
| 13246 | preempt_disable(); | ||
| 13247 | |||
| 13248 | vmx_prepare_switch_to_guest(vcpu); | ||
| 13249 | |||
| 13250 | /* | ||
| 13251 | * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, | ||
| 13252 | * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to | ||
| 13253 | * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. | ||
| 13254 | * there is no need to preserve other bits or save/restore the field. | ||
| 13255 | */ | ||
| 13256 | vmcs_writel(GUEST_RFLAGS, 0); | ||
| 13257 | |||
| 13258 | vmcs_writel(HOST_RIP, vmx_early_consistency_check_return); | ||
| 13259 | |||
| 13260 | cr3 = __get_current_cr3_fast(); | ||
| 13261 | if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { | ||
| 13262 | vmcs_writel(HOST_CR3, cr3); | ||
| 13263 | vmx->loaded_vmcs->host_state.cr3 = cr3; | ||
| 13264 | } | ||
| 13265 | |||
| 13266 | cr4 = cr4_read_shadow(); | ||
| 13267 | if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { | ||
| 13268 | vmcs_writel(HOST_CR4, cr4); | ||
| 13269 | vmx->loaded_vmcs->host_state.cr4 = cr4; | ||
| 13270 | } | ||
| 13271 | |||
| 13272 | vmx->__launched = vmx->loaded_vmcs->launched; | ||
| 13273 | |||
| 13274 | asm( | ||
| 13275 | /* Set HOST_RSP */ | ||
| 13276 | __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" | ||
| 13277 | "mov %%" _ASM_SP ", %c[host_rsp](%0)\n\t" | ||
| 13278 | |||
| 13279 | /* Check if vmlaunch or vmresume is needed */ | ||
| 13280 | "cmpl $0, %c[launched](%0)\n\t" | ||
| 13281 | "je 1f\n\t" | ||
| 13282 | __ex("vmresume") "\n\t" | ||
| 13283 | "jmp 2f\n\t" | ||
| 13284 | "1: " __ex("vmlaunch") "\n\t" | ||
| 13285 | "jmp 2f\n\t" | ||
| 13286 | "2: " | ||
| 13287 | |||
| 13288 | /* Set vmx->fail accordingly */ | ||
| 13289 | "setbe %c[fail](%0)\n\t" | ||
| 13290 | |||
| 13291 | ".pushsection .rodata\n\t" | ||
| 13292 | ".global vmx_early_consistency_check_return\n\t" | ||
| 13293 | "vmx_early_consistency_check_return: " _ASM_PTR " 2b\n\t" | ||
| 13294 | ".popsection" | ||
| 13295 | : | ||
| 13296 | : "c"(vmx), "d"((unsigned long)HOST_RSP), | ||
| 13297 | [launched]"i"(offsetof(struct vcpu_vmx, __launched)), | ||
| 13298 | [fail]"i"(offsetof(struct vcpu_vmx, fail)), | ||
| 13299 | [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)) | ||
| 13300 | : "rax", "cc", "memory" | ||
| 13301 | ); | ||
| 13302 | |||
| 13303 | vmcs_writel(HOST_RIP, vmx_return); | ||
| 13304 | |||
| 13305 | preempt_enable(); | ||
| 13306 | |||
| 13307 | if (vmx->msr_autoload.host.nr) | ||
| 13308 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); | ||
| 13309 | if (vmx->msr_autoload.guest.nr) | ||
| 13310 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); | ||
| 13311 | |||
| 13312 | if (vmx->fail) { | ||
| 13313 | WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != | ||
| 13314 | VMXERR_ENTRY_INVALID_CONTROL_FIELD); | ||
| 13315 | vmx->fail = 0; | ||
| 13316 | return 1; | ||
| 13317 | } | ||
| 13318 | |||
| 13319 | /* | ||
| 13320 | * VMExit clears RFLAGS.IF and DR7, even on a consistency check. | ||
| 13321 | */ | ||
| 13322 | local_irq_enable(); | ||
| 13323 | if (hw_breakpoint_active()) | ||
| 13324 | set_debugreg(__this_cpu_read(cpu_dr7), 7); | ||
| 13325 | |||
| 13326 | /* | ||
| 13327 | * A non-failing VMEntry means we somehow entered guest mode with | ||
| 13328 | * an illegal RIP, and that's just the tip of the iceberg. There | ||
| 13329 | * is no telling what memory has been modified or what state has | ||
| 13330 | * been exposed to unknown code. Hitting this all but guarantees | ||
| 13331 | * a (very critical) hardware issue. | ||
| 13332 | */ | ||
| 13333 | WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & | ||
| 13334 | VMX_EXIT_REASONS_FAILED_VMENTRY)); | ||
| 13335 | |||
| 13336 | return 0; | ||
| 13337 | } | ||
| 13338 | STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw); | ||
| 13339 | |||
| 13340 | static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, | ||
| 13341 | struct vmcs12 *vmcs12); | ||
| 13342 | |||
| 13343 | /* | ||
| 13344 | * If from_vmentry is false, this is being called from state restore (either RSM | ||
| 13345 | * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. | ||
| 13346 | * | ||
| 13347 | * Returns: | ||
| 13348 | * 0 - success, i.e. proceed with actual VMEnter | ||
| 13349 | * 1 - consistency check VMExit | ||
| 13350 | * -1 - consistency check VMFail | ||
| 13351 | */ | ||
| 13352 | static int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, | ||
| 13353 | bool from_vmentry) | ||
| 13354 | { | ||
| 13355 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 13356 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 13357 | bool evaluate_pending_interrupts; | ||
| 13358 | u32 exit_reason = EXIT_REASON_INVALID_STATE; | ||
| 13359 | u32 exit_qual; | ||
| 13360 | |||
| 13361 | evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & | ||
| 13362 | (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING); | ||
| 13363 | if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) | ||
| 13364 | evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); | ||
| 13365 | |||
| 13366 | if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) | ||
| 13367 | vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); | ||
| 13368 | if (kvm_mpx_supported() && | ||
| 13369 | !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) | ||
| 13370 | vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); | ||
| 13371 | |||
| 13372 | vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); | ||
| 13373 | |||
| 13374 | prepare_vmcs02_early(vmx, vmcs12); | ||
| 13375 | |||
| 13376 | if (from_vmentry) { | ||
| 13377 | nested_get_vmcs12_pages(vcpu); | ||
| 13378 | |||
| 13379 | if (nested_vmx_check_vmentry_hw(vcpu)) { | ||
| 13380 | vmx_switch_vmcs(vcpu, &vmx->vmcs01); | ||
| 13381 | return -1; | ||
| 13382 | } | ||
| 13383 | |||
| 13384 | if (check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) | ||
| 13385 | goto vmentry_fail_vmexit; | ||
| 13386 | } | ||
| 13387 | |||
| 13388 | enter_guest_mode(vcpu); | ||
| 13389 | if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) | ||
| 13390 | vcpu->arch.tsc_offset += vmcs12->tsc_offset; | ||
| 13391 | |||
| 13392 | if (prepare_vmcs02(vcpu, vmcs12, &exit_qual)) | ||
| 13393 | goto vmentry_fail_vmexit_guest_mode; | ||
| 13394 | |||
| 13395 | if (from_vmentry) { | ||
| 13396 | exit_reason = EXIT_REASON_MSR_LOAD_FAIL; | ||
| 13397 | exit_qual = nested_vmx_load_msr(vcpu, | ||
| 13398 | vmcs12->vm_entry_msr_load_addr, | ||
| 13399 | vmcs12->vm_entry_msr_load_count); | ||
| 13400 | if (exit_qual) | ||
| 13401 | goto vmentry_fail_vmexit_guest_mode; | ||
| 13402 | } else { | ||
| 13403 | /* | ||
| 13404 | * The MMU is not initialized to point at the right entities yet and | ||
| 13405 | * "get pages" would need to read data from the guest (i.e. we will | ||
| 13406 | * need to perform gpa to hpa translation). Request a call | ||
| 13407 | * to nested_get_vmcs12_pages before the next VM-entry. The MSRs | ||
| 13408 | * have already been set at vmentry time and should not be reset. | ||
| 13409 | */ | ||
| 13410 | kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); | ||
| 13411 | } | ||
| 13412 | |||
| 13413 | /* | ||
| 13414 | * If L1 had a pending IRQ/NMI until it executed | ||
| 13415 | * VMLAUNCH/VMRESUME which wasn't delivered because it was | ||
| 13416 | * disallowed (e.g. interrupts disabled), L0 needs to | ||
| 13417 | * evaluate if this pending event should cause an exit from L2 | ||
| 13418 | * to L1 or be delivered directly to L2 (e.g. in case L1 doesn't | ||
| 13419 | * intercept EXTERNAL_INTERRUPT). | ||
| 13420 | * | ||
| 13421 | * Usually this would be handled by the processor noticing an | ||
| 13422 | * IRQ/NMI window request, or checking RVI during evaluation of | ||
| 13423 | * pending virtual interrupts. However, this setting was done | ||
| 13424 | * on VMCS01 and now VMCS02 is active instead. Thus, we force L0 | ||
| 13425 | * to perform pending event evaluation by requesting a KVM_REQ_EVENT. | ||
| 13426 | */ | ||
| 13427 | if (unlikely(evaluate_pending_interrupts)) | ||
| 13428 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
| 13429 | |||
| 13430 | /* | ||
| 13431 | * Note no nested_vmx_succeed or nested_vmx_fail here. At this point | ||
| 13432 | * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet | ||
| 13433 | * returned as far as L1 is concerned. It will only return (and set | ||
| 13434 | * the success flag) when L2 exits (see nested_vmx_vmexit()). | ||
| 13435 | */ | ||
| 13436 | return 0; | ||
| 13437 | |||
| 13438 | /* | ||
| 13439 | * A failed consistency check that leads to a VMExit during L1's | ||
| 13440 | * VMEnter to L2 is a variation of a normal VMexit, as explained in | ||
| 13441 | * 26.7 "VM-entry failures during or after loading guest state". | ||
| 13442 | */ | ||
| 13443 | vmentry_fail_vmexit_guest_mode: | ||
| 13444 | if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) | ||
| 13445 | vcpu->arch.tsc_offset -= vmcs12->tsc_offset; | ||
| 13446 | leave_guest_mode(vcpu); | ||
| 13447 | |||
| 13448 | vmentry_fail_vmexit: | ||
| 13449 | vmx_switch_vmcs(vcpu, &vmx->vmcs01); | ||
| 13450 | |||
| 13451 | if (!from_vmentry) | ||
| 13452 | return 1; | ||
| 13453 | |||
| 13454 | load_vmcs12_host_state(vcpu, vmcs12); | ||
| 13455 | vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; | ||
| 13456 | vmcs12->exit_qualification = exit_qual; | ||
| 13457 | if (enable_shadow_vmcs || vmx->nested.hv_evmcs) | ||
| 13458 | vmx->nested.need_vmcs12_sync = true; | ||
| 13459 | return 1; | ||
| 13460 | } | ||
| 13461 | |||
| 13462 | /* | ||
| 13463 | * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 | ||
| 13464 | * for running an L2 nested guest. | ||
| 13465 | */ | ||
| 13466 | static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) | ||
| 13467 | { | ||
| 13468 | struct vmcs12 *vmcs12; | ||
| 13469 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 13470 | u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); | ||
| 13471 | int ret; | ||
| 13472 | |||
| 13473 | if (!nested_vmx_check_permission(vcpu)) | ||
| 13474 | return 1; | ||
| 13475 | |||
| 13476 | if (!nested_vmx_handle_enlightened_vmptrld(vcpu, true)) | ||
| 13477 | return 1; | ||
| 13478 | |||
| 13479 | if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull) | ||
| 13480 | return nested_vmx_failInvalid(vcpu); | ||
| 13481 | |||
| 13482 | vmcs12 = get_vmcs12(vcpu); | ||
| 13483 | |||
| 13484 | /* | ||
| 13485 | * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact | ||
| 13486 | * that there *is* a valid VMCS pointer, RFLAGS.CF is set | ||
| 13487 | * rather than RFLAGS.ZF, and no error number is stored to the | ||
| 13488 | * VM-instruction error field. | ||
| 13489 | */ | ||
| 13490 | if (vmcs12->hdr.shadow_vmcs) | ||
| 13491 | return nested_vmx_failInvalid(vcpu); | ||
| 13492 | |||
| 13493 | if (vmx->nested.hv_evmcs) { | ||
| 13494 | copy_enlightened_to_vmcs12(vmx); | ||
| 13495 | /* Enlightened VMCS doesn't have launch state */ | ||
| 13496 | vmcs12->launch_state = !launch; | ||
| 13497 | } else if (enable_shadow_vmcs) { | ||
| 13498 | copy_shadow_to_vmcs12(vmx); | ||
| 13499 | } | ||
| 13500 | |||
| 13501 | /* | ||
| 13502 | * The nested entry process starts with enforcing various prerequisites | ||
| 13503 | * on vmcs12 as required by the Intel SDM, and acts appropriately when | ||
| 13504 | * they fail: as the SDM explains, some conditions should cause the | ||
| 13505 | * instruction to fail, while others will cause the instruction to seem | ||
| 13506 | * to succeed, but return an EXIT_REASON_INVALID_STATE. | ||
| 13507 | * To speed up the normal (success) code path, we should avoid checking | ||
| 13508 | * for misconfigurations which will anyway be caught by the processor | ||
| 13509 | * when using the merged vmcs02. | ||
| 13510 | */ | ||
| 13511 | if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) | ||
| 13512 | return nested_vmx_failValid(vcpu, | ||
| 13513 | VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); | ||
| 13514 | |||
| 13515 | if (vmcs12->launch_state == launch) | ||
| 13516 | return nested_vmx_failValid(vcpu, | ||
| 13517 | launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS | ||
| 13518 | : VMXERR_VMRESUME_NONLAUNCHED_VMCS); | ||
| 13519 | |||
| 13520 | ret = check_vmentry_prereqs(vcpu, vmcs12); | ||
| 13521 | if (ret) | ||
| 13522 | return nested_vmx_failValid(vcpu, ret); | ||
| 13523 | |||
| 13524 | /* | ||
| 13525 | * We're finally done with prerequisite checking, and can start with | ||
| 13526 | * the nested entry. | ||
| 13527 | */ | ||
| 13528 | vmx->nested.nested_run_pending = 1; | ||
| 13529 | ret = nested_vmx_enter_non_root_mode(vcpu, true); | ||
| 13530 | vmx->nested.nested_run_pending = !ret; | ||
| 13531 | if (ret > 0) | ||
| 13532 | return 1; | ||
| 13533 | else if (ret) | ||
| 13534 | return nested_vmx_failValid(vcpu, | ||
| 13535 | VMXERR_ENTRY_INVALID_CONTROL_FIELD); | ||
| 13536 | |||
| 13537 | /* Hide L1D cache contents from the nested guest. */ | ||
| 13538 | vmx->vcpu.arch.l1tf_flush_l1d = true; | ||
| 13539 | |||
| 13540 | /* | ||
| 13541 | * Must happen outside of nested_vmx_enter_non_root_mode() as it will | ||
| 13542 | * also be used as part of restoring nVMX state for | ||
| 13543 | * snapshot restore (migration). | ||
| 13544 | * | ||
| 13545 | * In this flow, it is assumed that vmcs12 cache was | ||
| 13546 | * transferred as part of captured nVMX state and should | ||
| 13547 | * therefore not be read from guest memory (which may not | ||
| 13548 | * exist on destination host yet). | ||
| 13549 | */ | ||
| 13550 | nested_cache_shadow_vmcs12(vcpu, vmcs12); | ||
| 13551 | |||
| 13552 | /* | ||
| 13553 | * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken | ||
| 13554 | * by event injection, halt vcpu. | ||
| 13555 | */ | ||
| 13556 | if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) && | ||
| 13557 | !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK)) { | ||
| 13558 | vmx->nested.nested_run_pending = 0; | ||
| 13559 | return kvm_vcpu_halt(vcpu); | ||
| 13560 | } | ||
| 13561 | return 1; | ||
| 13562 | } | ||
| 13563 | |||
| 13564 | /* | ||
| 13565 | * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date | ||
| 13566 | * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). | ||
| 13567 | * This function returns the new value we should put in vmcs12.guest_cr0. | ||
| 13568 | * It's not enough to just return the vmcs02 GUEST_CR0. Rather, | ||
| 13569 | * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now | ||
| 13570 | * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 | ||
| 13571 | * didn't trap the bit, because if L1 did, so would L0). | ||
| 13572 | * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have | ||
| 13573 | * been modified by L2, and L1 knows it. So just leave the old value of | ||
| 13574 | * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 | ||
| 13575 | * isn't relevant, because if L0 traps this bit it can set it to anything. | ||
| 13576 | * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have | ||
| 13577 | * changed these bits, and therefore they need to be updated, but L0 | ||
| 13578 | * didn't necessarily allow them to be changed in GUEST_CR0 - and rather | ||
| 13579 | * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. | ||
| 13580 | */ | ||
| 13581 | static inline unsigned long | ||
| 13582 | vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | ||
| 13583 | { | ||
| 13584 | return | ||
| 13585 | /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | | ||
| 13586 | /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | | ||
| 13587 | /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | | ||
| 13588 | vcpu->arch.cr0_guest_owned_bits)); | ||
| 13589 | } | ||
| 13590 | |||
| 13591 | static inline unsigned long | ||
| 13592 | vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | ||
| 13593 | { | ||
| 13594 | return | ||
| 13595 | /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | | ||
| 13596 | /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | | ||
| 13597 | /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | | ||
| 13598 | vcpu->arch.cr4_guest_owned_bits)); | ||
| 13599 | } | ||
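The three-way merge performed by vmcs12_guest_cr0() and vmcs12_guest_cr4() is easier to see with concrete masks. A minimal userspace sketch, not part of the patch, with made-up values in which L0 lets the guest own CR0.TS, L1 traps CR0.PE, and every other bit comes from the read shadow:

    #include <stdio.h>

    /* Same formula as vmcs12_guest_cr0(): hardware value for guest-owned bits,
     * vmcs12 value for bits L1 traps, read shadow for the rest. */
    static unsigned long merge_guest_cr0(unsigned long hw_guest_cr0,
                                         unsigned long cr0_read_shadow,
                                         unsigned long vmcs12_guest_cr0,
                                         unsigned long l0_guest_owned_bits,
                                         unsigned long l1_guest_host_mask)
    {
            return (hw_guest_cr0 & l0_guest_owned_bits) |
                   (vmcs12_guest_cr0 & l1_guest_host_mask) |
                   (cr0_read_shadow & ~(l1_guest_host_mask | l0_guest_owned_bits));
    }

    int main(void)
    {
            /* Hypothetical masks: L0 leaves CR0.TS (bit 3) to the guest, L1 traps CR0.PE (bit 0). */
            unsigned long merged = merge_guest_cr0(0x80000019UL, 0x80000011UL,
                                                   0x80000031UL, 1UL << 3, 1UL << 0);

            printf("merged vmcs12.guest_cr0 = 0x%lx\n", merged); /* TS from hardware, PE from vmcs12 */
            return 0;
    }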
| 13600 | |||
| 13601 | static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, | ||
| 13602 | struct vmcs12 *vmcs12) | ||
| 13603 | { | ||
| 13604 | u32 idt_vectoring; | ||
| 13605 | unsigned int nr; | ||
| 13606 | |||
| 13607 | if (vcpu->arch.exception.injected) { | ||
| 13608 | nr = vcpu->arch.exception.nr; | ||
| 13609 | idt_vectoring = nr | VECTORING_INFO_VALID_MASK; | ||
| 13610 | |||
| 13611 | if (kvm_exception_is_soft(nr)) { | ||
| 13612 | vmcs12->vm_exit_instruction_len = | ||
| 13613 | vcpu->arch.event_exit_inst_len; | ||
| 13614 | idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; | ||
| 13615 | } else | ||
| 13616 | idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; | ||
| 13617 | |||
| 13618 | if (vcpu->arch.exception.has_error_code) { | ||
| 13619 | idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; | ||
| 13620 | vmcs12->idt_vectoring_error_code = | ||
| 13621 | vcpu->arch.exception.error_code; | ||
| 13622 | } | ||
| 13623 | |||
| 13624 | vmcs12->idt_vectoring_info_field = idt_vectoring; | ||
| 13625 | } else if (vcpu->arch.nmi_injected) { | ||
| 13626 | vmcs12->idt_vectoring_info_field = | ||
| 13627 | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; | ||
| 13628 | } else if (vcpu->arch.interrupt.injected) { | ||
| 13629 | nr = vcpu->arch.interrupt.nr; | ||
| 13630 | idt_vectoring = nr | VECTORING_INFO_VALID_MASK; | ||
| 13631 | |||
| 13632 | if (vcpu->arch.interrupt.soft) { | ||
| 13633 | idt_vectoring |= INTR_TYPE_SOFT_INTR; | ||
| 13634 | vmcs12->vm_entry_instruction_len = | ||
| 13635 | vcpu->arch.event_exit_inst_len; | ||
| 13636 | } else | ||
| 13637 | idt_vectoring |= INTR_TYPE_EXT_INTR; | ||
| 13638 | |||
| 13639 | vmcs12->idt_vectoring_info_field = idt_vectoring; | ||
| 13640 | } | ||
| 13641 | } | ||
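For a concrete feel of what vmcs12_save_pending_event() writes, a pending software interrupt such as "int 0x80" would be re-encoded roughly as in the sketch below (not part of the patch). The bit positions follow the SDM's IDT-vectoring information layout (vector in bits 7:0, type in bits 10:8, valid in bit 31); the literals are stand-ins for INTR_TYPE_SOFT_INTR and VECTORING_INFO_VALID_MASK.

    #include <stdio.h>

    int main(void)
    {
            unsigned int vector = 0x80;            /* the injected vector, e.g. int 0x80 */
            unsigned int soft_intr_type = 4u << 8; /* interruption type 4 = software interrupt */
            unsigned int valid = 1u << 31;         /* valid bit */

            /* Prints 0x80000480: vector 0x80, type 4, valid set. */
            printf("idt_vectoring_info_field = 0x%08x\n", vector | soft_intr_type | valid);
            return 0;
    }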
| 13642 | |||
| 13643 | static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) | ||
| 13644 | { | ||
| 13645 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 13646 | unsigned long exit_qual; | ||
| 13647 | bool block_nested_events = | ||
| 13648 | vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); | ||
| 13649 | |||
| 13650 | if (vcpu->arch.exception.pending && | ||
| 13651 | nested_vmx_check_exception(vcpu, &exit_qual)) { | ||
| 13652 | if (block_nested_events) | ||
| 13653 | return -EBUSY; | ||
| 13654 | nested_vmx_inject_exception_vmexit(vcpu, exit_qual); | ||
| 13655 | return 0; | ||
| 13656 | } | ||
| 13657 | |||
| 13658 | if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && | ||
| 13659 | vmx->nested.preemption_timer_expired) { | ||
| 13660 | if (block_nested_events) | ||
| 13661 | return -EBUSY; | ||
| 13662 | nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); | ||
| 13663 | return 0; | ||
| 13664 | } | ||
| 13665 | |||
| 13666 | if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { | ||
| 13667 | if (block_nested_events) | ||
| 13668 | return -EBUSY; | ||
| 13669 | nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, | ||
| 13670 | NMI_VECTOR | INTR_TYPE_NMI_INTR | | ||
| 13671 | INTR_INFO_VALID_MASK, 0); | ||
| 13672 | /* | ||
| 13673 | * The NMI-triggered VM exit counts as injection: | ||
| 13674 | * clear this one and block further NMIs. | ||
| 13675 | */ | ||
| 13676 | vcpu->arch.nmi_pending = 0; | ||
| 13677 | vmx_set_nmi_mask(vcpu, true); | ||
| 13678 | return 0; | ||
| 13679 | } | ||
| 13680 | |||
| 13681 | if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && | ||
| 13682 | nested_exit_on_intr(vcpu)) { | ||
| 13683 | if (block_nested_events) | ||
| 13684 | return -EBUSY; | ||
| 13685 | nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); | ||
| 13686 | return 0; | ||
| 13687 | } | ||
| 13688 | |||
| 13689 | vmx_complete_nested_posted_interrupt(vcpu); | ||
| 13690 | return 0; | ||
| 13691 | } | ||
| 13692 | |||
| 13693 | static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) | ||
| 13694 | { | ||
| 13695 | to_vmx(vcpu)->req_immediate_exit = true; | ||
| 13696 | } | ||
| 13697 | |||
| 13698 | static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) | ||
| 13699 | { | ||
| 13700 | ktime_t remaining = | ||
| 13701 | hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); | ||
| 13702 | u64 value; | ||
| 13703 | |||
| 13704 | if (ktime_to_ns(remaining) <= 0) | ||
| 13705 | return 0; | ||
| 13706 | |||
| 13707 | value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; | ||
| 13708 | do_div(value, 1000000); | ||
| 13709 | return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; | ||
| 13710 | } | ||
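The unit conversion in vmx_get_preemption_timer_value() turns the remaining hrtimer nanoseconds into guest TSC cycles (ns * virtual_tsc_khz / 10^6) and then scales them down by the preemption-timer rate. A tiny worked sketch, not part of the patch, assuming a 2 GHz guest TSC and a rate shift of 5 as a stand-in for VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long remaining_ns = 1000000;    /* 1 ms left on the hrtimer (hypothetical) */
            unsigned long long virtual_tsc_khz = 2000000; /* 2 GHz guest TSC (hypothetical) */
            unsigned int rate_shift = 5;                  /* assumed preemption-timer rate */

            unsigned long long cycles = remaining_ns * virtual_tsc_khz / 1000000;

            /* 2,000,000 guest TSC cycles >> 5 = 62500 preemption-timer ticks. */
            printf("preemption timer value = %llu\n", cycles >> rate_shift);
            return 0;
    }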
| 13711 | |||
| 13712 | /* | ||
| 13713 | * Update the guest state fields of vmcs12 to reflect changes that | ||
| 13714 | * occurred while L2 was running. (The "IA-32e mode guest" bit of the | ||
| 13715 | * VM-entry controls is also updated, since this is really a guest | ||
| 13716 | * state bit.) | ||
| 13717 | */ | ||
| 13718 | static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | ||
| 13719 | { | ||
| 13720 | vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); | ||
| 13721 | vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); | ||
| 13722 | |||
| 13723 | vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); | ||
| 13724 | vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP); | ||
| 13725 | vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); | ||
| 13726 | |||
| 13727 | vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); | ||
| 13728 | vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); | ||
| 13729 | vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); | ||
| 13730 | vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); | ||
| 13731 | vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); | ||
| 13732 | vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); | ||
| 13733 | vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); | ||
| 13734 | vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); | ||
| 13735 | vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); | ||
| 13736 | vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); | ||
| 13737 | vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); | ||
| 13738 | vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); | ||
| 13739 | vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); | ||
| 13740 | vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); | ||
| 13741 | vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); | ||
| 13742 | vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); | ||
| 13743 | vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); | ||
| 13744 | vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); | ||
| 13745 | vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); | ||
| 13746 | vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); | ||
| 13747 | vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); | ||
| 13748 | vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); | ||
| 13749 | vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); | ||
| 13750 | vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); | ||
| 13751 | vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); | ||
| 13752 | vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); | ||
| 13753 | vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); | ||
| 13754 | vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); | ||
| 13755 | vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); | ||
| 13756 | vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); | ||
| 13757 | vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); | ||
| 13758 | vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); | ||
| 13759 | vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); | ||
| 13760 | vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); | ||
| 13761 | vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); | ||
| 13762 | vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); | ||
| 13763 | |||
| 13764 | vmcs12->guest_interruptibility_info = | ||
| 13765 | vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); | ||
| 13766 | vmcs12->guest_pending_dbg_exceptions = | ||
| 13767 | vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); | ||
| 13768 | if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) | ||
| 13769 | vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; | ||
| 13770 | else | ||
| 13771 | vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; | ||
| 13772 | |||
| 13773 | if (nested_cpu_has_preemption_timer(vmcs12)) { | ||
| 13774 | if (vmcs12->vm_exit_controls & | ||
| 13775 | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) | ||
| 13776 | vmcs12->vmx_preemption_timer_value = | ||
| 13777 | vmx_get_preemption_timer_value(vcpu); | ||
| 13778 | hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); | ||
| 13779 | } | ||
| 13780 | |||
| 13781 | /* | ||
| 13782 | * In some cases (usually, nested EPT), L2 is allowed to change its | ||
| 13783 | * own CR3 without exiting. If it has changed it, we must keep it. | ||
| 13784 | * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined | ||
| 13785 | * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. | ||
| 13786 | * | ||
| 13787 | * Additionally, restore L2's PDPTR to vmcs12. | ||
| 13788 | */ | ||
| 13789 | if (enable_ept) { | ||
| 13790 | vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); | ||
| 13791 | vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); | ||
| 13792 | vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); | ||
| 13793 | vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); | ||
| 13794 | vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); | ||
| 13795 | } | ||
| 13796 | |||
| 13797 | vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); | ||
| 13798 | |||
| 13799 | if (nested_cpu_has_vid(vmcs12)) | ||
| 13800 | vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); | ||
| 13801 | |||
| 13802 | vmcs12->vm_entry_controls = | ||
| 13803 | (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | | ||
| 13804 | (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); | ||
| 13805 | |||
| 13806 | if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) { | ||
| 13807 | kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); | ||
| 13808 | vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); | ||
| 13809 | } | ||
| 13810 | |||
| 13811 | /* TODO: These cannot have changed unless we have MSR bitmaps and | ||
| 13812 | * the relevant bit asks not to trap the change */ | ||
| 13813 | if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) | ||
| 13814 | vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); | ||
| 13815 | if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) | ||
| 13816 | vmcs12->guest_ia32_efer = vcpu->arch.efer; | ||
| 13817 | vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); | ||
| 13818 | vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); | ||
| 13819 | vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); | ||
| 13820 | if (kvm_mpx_supported()) | ||
| 13821 | vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); | ||
| 13822 | } | ||
| 13823 | |||
| 13824 | /* | ||
| 13825 | * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits | ||
| 13826 | * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), | ||
| 13827 | * and this function updates it to reflect the changes to the guest state while | ||
| 13828 | * L2 was running (and perhaps made some exits which were handled directly by L0 | ||
| 13829 | * without going back to L1), and to reflect the exit reason. | ||
| 13830 | * Note that we do not have to copy here all VMCS fields, just those that | ||
| 13831 | * could have been changed by the L2 guest or the exit - i.e., the guest-state and | ||
| 13832 | * exit-information fields only. Other fields are modified by L1 with VMWRITE, | ||
| 13833 | * which already writes to vmcs12 directly. | ||
| 13834 | */ | ||
| 13835 | static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, | ||
| 13836 | u32 exit_reason, u32 exit_intr_info, | ||
| 13837 | unsigned long exit_qualification) | ||
| 13838 | { | ||
| 13839 | /* update guest state fields: */ | ||
| 13840 | sync_vmcs12(vcpu, vmcs12); | ||
| 13841 | |||
| 13842 | /* update exit information fields: */ | ||
| 13843 | |||
| 13844 | vmcs12->vm_exit_reason = exit_reason; | ||
| 13845 | vmcs12->exit_qualification = exit_qualification; | ||
| 13846 | vmcs12->vm_exit_intr_info = exit_intr_info; | ||
| 13847 | |||
| 13848 | vmcs12->idt_vectoring_info_field = 0; | ||
| 13849 | vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | ||
| 13850 | vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
| 13851 | |||
| 13852 | if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { | ||
| 13853 | vmcs12->launch_state = 1; | ||
| 13854 | |||
| 13855 | /* vm_entry_intr_info_field is cleared on exit. Emulate this | ||
| 13856 | * instead of reading the real value. */ | ||
| 13857 | vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; | ||
| 13858 | |||
| 13859 | /* | ||
| 13860 | * Transfer the event that L0 or L1 may have wanted to inject into | ||
| 13861 | * L2 to IDT_VECTORING_INFO_FIELD. | ||
| 13862 | */ | ||
| 13863 | vmcs12_save_pending_event(vcpu, vmcs12); | ||
| 13864 | } | ||
| 13865 | |||
| 13866 | /* | ||
| 13867 | * Drop what we picked up for L2 via vmx_complete_interrupts. It is | ||
| 13868 | * preserved above and would only end up incorrectly in L1. | ||
| 13869 | */ | ||
| 13870 | vcpu->arch.nmi_injected = false; | ||
| 13871 | kvm_clear_exception_queue(vcpu); | ||
| 13872 | kvm_clear_interrupt_queue(vcpu); | ||
| 13873 | } | ||
| 13874 | |||
| 13875 | /* | ||
| 13876 | * Part of what we need to do when the nested L2 guest exits and we want to | ||
| 13877 | * run its L1 parent is to reset L1's guest state to the host state specified | ||
| 13878 | * in vmcs12. | ||
| 13879 | * This function is to be called not only on normal nested exit, but also on | ||
| 13880 | * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry | ||
| 13881 | * Failures During or After Loading Guest State"). | ||
| 13882 | * This function should be called when the active VMCS is L1's (vmcs01). | ||
| 13883 | */ | ||
| 13884 | static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, | ||
| 13885 | struct vmcs12 *vmcs12) | ||
| 13886 | { | ||
| 13887 | struct kvm_segment seg; | ||
| 13888 | u32 entry_failure_code; | ||
| 13889 | |||
| 13890 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) | ||
| 13891 | vcpu->arch.efer = vmcs12->host_ia32_efer; | ||
| 13892 | else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) | ||
| 13893 | vcpu->arch.efer |= (EFER_LMA | EFER_LME); | ||
| 13894 | else | ||
| 13895 | vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); | ||
| 13896 | vmx_set_efer(vcpu, vcpu->arch.efer); | ||
| 13897 | |||
| 13898 | kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); | ||
| 13899 | kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); | ||
| 13900 | vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); | ||
| 13901 | vmx_set_interrupt_shadow(vcpu, 0); | ||
| 13902 | |||
| 13903 | /* | ||
| 13904 | * Note that calling vmx_set_cr0 is important, even if cr0 hasn't | ||
| 13905 | * actually changed, because vmx_set_cr0 refers to efer set above. | ||
| 13906 | * | ||
| 13907 | * CR0_GUEST_HOST_MASK is already set in the original vmcs01 | ||
| 13908 | * (KVM doesn't change it); | ||
| 13909 | */ | ||
| 13910 | vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; | ||
| 13911 | vmx_set_cr0(vcpu, vmcs12->host_cr0); | ||
| 13912 | |||
| 13913 | /* Same as above - no reason to call set_cr4_guest_host_mask(). */ | ||
| 13914 | vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); | ||
| 13915 | vmx_set_cr4(vcpu, vmcs12->host_cr4); | ||
| 13916 | |||
| 13917 | nested_ept_uninit_mmu_context(vcpu); | ||
| 13918 | |||
| 13919 | /* | ||
| 13920 | * Only PDPTE load can fail as the value of cr3 was checked on entry and | ||
| 13921 | * couldn't have changed. | ||
| 13922 | */ | ||
| 13923 | if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) | ||
| 13924 | nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); | ||
| 13925 | |||
| 13926 | if (!enable_ept) | ||
| 13927 | vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; | ||
| 13928 | |||
| 13929 | /* | ||
| 13930 | * If vmcs01 doesn't use VPID, CPU flushes TLB on every | ||
| 13931 | * VMEntry/VMExit. Thus, no need to flush TLB. | ||
| 13932 | * | ||
| 13933 | * If vmcs12 doesn't use VPID, L1 expects TLB to be | ||
| 13934 | * flushed on every VMEntry/VMExit. | ||
| 13935 | * | ||
| 13936 | * Otherwise, we can preserve TLB entries as long as we are | ||
| 13937 | * able to tag L1 TLB entries differently than L2 TLB entries. | ||
| 13938 | * | ||
| 13939 | * If vmcs12 uses EPT, we need to execute this flush on EPTP01 | ||
| 13940 | * and therefore we request the TLB flush to happen only after VMCS EPTP | ||
| 13941 | * has been set by KVM_REQ_LOAD_CR3. | ||
| 13942 | */ | ||
| 13943 | if (enable_vpid && | ||
| 13944 | (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) { | ||
| 13945 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); | ||
| 13946 | } | ||
| 13947 | |||
| 13948 | vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); | ||
| 13949 | vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); | ||
| 13950 | vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); | ||
| 13951 | vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); | ||
| 13952 | vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); | ||
| 13953 | vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); | ||
| 13954 | vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); | ||
| 13955 | |||
| 13956 | /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ | ||
| 13957 | if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) | ||
| 13958 | vmcs_write64(GUEST_BNDCFGS, 0); | ||
| 13959 | |||
| 13960 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { | ||
| 13961 | vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); | ||
| 13962 | vcpu->arch.pat = vmcs12->host_ia32_pat; | ||
| 13963 | } | ||
| 13964 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) | ||
| 13965 | vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, | ||
| 13966 | vmcs12->host_ia32_perf_global_ctrl); | ||
| 13967 | |||
| 13968 | /* Set L1 segment info according to Intel SDM | ||
| 13969 | 27.5.2 Loading Host Segment and Descriptor-Table Registers */ | ||
| 13970 | seg = (struct kvm_segment) { | ||
| 13971 | .base = 0, | ||
| 13972 | .limit = 0xFFFFFFFF, | ||
| 13973 | .selector = vmcs12->host_cs_selector, | ||
| 13974 | .type = 11, | ||
| 13975 | .present = 1, | ||
| 13976 | .s = 1, | ||
| 13977 | .g = 1 | ||
| 13978 | }; | ||
| 13979 | if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) | ||
| 13980 | seg.l = 1; | ||
| 13981 | else | ||
| 13982 | seg.db = 1; | ||
| 13983 | vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); | ||
| 13984 | seg = (struct kvm_segment) { | ||
| 13985 | .base = 0, | ||
| 13986 | .limit = 0xFFFFFFFF, | ||
| 13987 | .type = 3, | ||
| 13988 | .present = 1, | ||
| 13989 | .s = 1, | ||
| 13990 | .db = 1, | ||
| 13991 | .g = 1 | ||
| 13992 | }; | ||
| 13993 | seg.selector = vmcs12->host_ds_selector; | ||
| 13994 | vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); | ||
| 13995 | seg.selector = vmcs12->host_es_selector; | ||
| 13996 | vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); | ||
| 13997 | seg.selector = vmcs12->host_ss_selector; | ||
| 13998 | vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); | ||
| 13999 | seg.selector = vmcs12->host_fs_selector; | ||
| 14000 | seg.base = vmcs12->host_fs_base; | ||
| 14001 | vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); | ||
| 14002 | seg.selector = vmcs12->host_gs_selector; | ||
| 14003 | seg.base = vmcs12->host_gs_base; | ||
| 14004 | vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); | ||
| 14005 | seg = (struct kvm_segment) { | ||
| 14006 | .base = vmcs12->host_tr_base, | ||
| 14007 | .limit = 0x67, | ||
| 14008 | .selector = vmcs12->host_tr_selector, | ||
| 14009 | .type = 11, | ||
| 14010 | .present = 1 | ||
| 14011 | }; | ||
| 14012 | vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); | ||
| 14013 | |||
| 14014 | kvm_set_dr(vcpu, 7, 0x400); | ||
| 14015 | vmcs_write64(GUEST_IA32_DEBUGCTL, 0); | ||
| 14016 | |||
| 14017 | if (cpu_has_vmx_msr_bitmap()) | ||
| 14018 | vmx_update_msr_bitmap(vcpu); | ||
| 14019 | |||
| 14020 | if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, | ||
| 14021 | vmcs12->vm_exit_msr_load_count)) | ||
| 14022 | nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); | ||
| 14023 | } | ||
| 14024 | |||
| 14025 | static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) | ||
| 14026 | { | ||
| 14027 | struct shared_msr_entry *efer_msr; | ||
| 14028 | unsigned int i; | ||
| 14029 | |||
| 14030 | if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) | ||
| 14031 | return vmcs_read64(GUEST_IA32_EFER); | ||
| 14032 | |||
| 14033 | if (cpu_has_load_ia32_efer) | ||
| 14034 | return host_efer; | ||
| 14035 | |||
| 14036 | for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { | ||
| 14037 | if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) | ||
| 14038 | return vmx->msr_autoload.guest.val[i].value; | ||
| 14039 | } | ||
| 14040 | |||
| 14041 | efer_msr = find_msr_entry(vmx, MSR_EFER); | ||
| 14042 | if (efer_msr) | ||
| 14043 | return efer_msr->data; | ||
| 14044 | |||
| 14045 | return host_efer; | ||
| 14046 | } | ||
| 14047 | |||
| 14048 | static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) | ||
| 14049 | { | ||
| 14050 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 14051 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 14052 | struct vmx_msr_entry g, h; | ||
| 14053 | struct msr_data msr; | ||
| 14054 | gpa_t gpa; | ||
| 14055 | u32 i, j; | ||
| 14056 | |||
| 14057 | vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); | ||
| 14058 | |||
| 14059 | if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { | ||
| 14060 | /* | ||
| 14061 | * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set | ||
| 14062 | * as vmcs01.GUEST_DR7 contains a userspace defined value | ||
| 14063 | * and vcpu->arch.dr7 is not squirreled away before the | ||
| 14064 | * nested VMENTER (not worth adding a variable in nested_vmx). | ||
| 14065 | */ | ||
| 14066 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) | ||
| 14067 | kvm_set_dr(vcpu, 7, DR7_FIXED_1); | ||
| 14068 | else | ||
| 14069 | WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); | ||
| 14070 | } | ||
| 14071 | |||
| 14072 | /* | ||
| 14073 | * Note that calling vmx_set_{efer,cr0,cr4} is important as they | ||
| 14074 | * handle a variety of side effects to KVM's software model. | ||
| 14075 | */ | ||
| 14076 | vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); | ||
| 14077 | |||
| 14078 | vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; | ||
| 14079 | vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); | ||
| 14080 | |||
| 14081 | vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); | ||
| 14082 | vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); | ||
| 14083 | |||
| 14084 | nested_ept_uninit_mmu_context(vcpu); | ||
| 14085 | vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); | ||
| 14086 | __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); | ||
| 14087 | |||
| 14088 | /* | ||
| 14089 | * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs | ||
| 14090 | * from vmcs01 (if necessary). The PDPTRs are not loaded on | ||
| 14091 | * VMFail; like everything else, we just need to ensure our | ||
| 14092 | * software model is up-to-date. | ||
| 14093 | */ | ||
| 14094 | ept_save_pdptrs(vcpu); | ||
| 14095 | |||
| 14096 | kvm_mmu_reset_context(vcpu); | ||
| 14097 | |||
| 14098 | if (cpu_has_vmx_msr_bitmap()) | ||
| 14099 | vmx_update_msr_bitmap(vcpu); | ||
| 14100 | |||
| 14101 | /* | ||
| 14102 | * This nasty bit of open coding is a compromise between blindly | ||
| 14103 | * loading L1's MSRs using the exit load lists (incorrect emulation | ||
| 14104 | * of VMFail), leaving the nested VM's MSRs in the software model | ||
| 14105 | * (incorrect behavior) and snapshotting the modified MSRs (too | ||
| 14106 | * expensive since the lists are unbounded by hardware). For each | ||
| 14107 | * MSR that was (prematurely) loaded from the nested VMEntry load | ||
| 14108 | * list, reload it from the exit load list if it exists and differs | ||
| 14109 | * from the guest value. The intent is to stuff host state as | ||
| 14110 | * silently as possible, not to fully process the exit load list. | ||
| 14111 | */ | ||
| 14112 | msr.host_initiated = false; | ||
| 14113 | for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { | ||
| 14114 | gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); | ||
| 14115 | if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { | ||
| 14116 | pr_debug_ratelimited( | ||
| 14117 | "%s read MSR index failed (%u, 0x%08llx)\n", | ||
| 14118 | __func__, i, gpa); | ||
| 14119 | goto vmabort; | ||
| 14120 | } | ||
| 14121 | |||
| 14122 | for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { | ||
| 14123 | gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); | ||
| 14124 | if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { | ||
| 14125 | pr_debug_ratelimited( | ||
| 14126 | "%s read MSR failed (%u, 0x%08llx)\n", | ||
| 14127 | __func__, j, gpa); | ||
| 14128 | goto vmabort; | ||
| 14129 | } | ||
| 14130 | if (h.index != g.index) | ||
| 14131 | continue; | ||
| 14132 | if (h.value == g.value) | ||
| 14133 | break; | ||
| 14134 | |||
| 14135 | if (nested_vmx_load_msr_check(vcpu, &h)) { | ||
| 14136 | pr_debug_ratelimited( | ||
| 14137 | "%s check failed (%u, 0x%x, 0x%x)\n", | ||
| 14138 | __func__, j, h.index, h.reserved); | ||
| 14139 | goto vmabort; | ||
| 14140 | } | ||
| 14141 | |||
| 14142 | msr.index = h.index; | ||
| 14143 | msr.data = h.value; | ||
| 14144 | if (kvm_set_msr(vcpu, &msr)) { | ||
| 14145 | pr_debug_ratelimited( | ||
| 14146 | "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", | ||
| 14147 | __func__, j, h.index, h.value); | ||
| 14148 | goto vmabort; | ||
| 14149 | } | ||
| 14150 | } | ||
| 14151 | } | ||
| 14152 | |||
| 14153 | return; | ||
| 14154 | |||
| 14155 | vmabort: | ||
| 14156 | nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); | ||
| 14157 | } | ||
| 14158 | |||
| 14159 | /* | ||
| 14160 | * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 | ||
| 14161 | * and modify vmcs12 to make it see what it would expect to see there if | ||
| 14162 | * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) | ||
| 14163 | */ | ||
| 14164 | static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, | ||
| 14165 | u32 exit_intr_info, | ||
| 14166 | unsigned long exit_qualification) | ||
| 14167 | { | ||
| 14168 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 14169 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 14170 | |||
| 14171 | /* trying to cancel vmlaunch/vmresume is a bug */ | ||
| 14172 | WARN_ON_ONCE(vmx->nested.nested_run_pending); | ||
| 14173 | |||
| 14174 | leave_guest_mode(vcpu); | ||
| 14175 | |||
| 14176 | if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) | ||
| 14177 | vcpu->arch.tsc_offset -= vmcs12->tsc_offset; | ||
| 14178 | |||
| 14179 | if (likely(!vmx->fail)) { | ||
| 14180 | if (exit_reason == -1) | ||
| 14181 | sync_vmcs12(vcpu, vmcs12); | ||
| 14182 | else | ||
| 14183 | prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, | ||
| 14184 | exit_qualification); | ||
| 14185 | |||
| 14186 | /* | ||
| 14187 | * Must happen outside of sync_vmcs12() as it will | ||
| 14188 | * also be used to capture vmcs12 cache as part of | ||
| 14189 | * capturing nVMX state for snapshot (migration). | ||
| 14190 | * | ||
| 14191 | * Otherwise, this flush will dirty guest memory at a | ||
| 14192 | * point where it is already assumed by user-space to be | ||
| 14193 | * immutable. | ||
| 14194 | */ | ||
| 14195 | nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); | ||
| 14196 | |||
| 14197 | if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, | ||
| 14198 | vmcs12->vm_exit_msr_store_count)) | ||
| 14199 | nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); | ||
| 14200 | } else { | ||
| 14201 | /* | ||
| 14202 | * The only expected VM-instruction error is "VM entry with | ||
| 14203 | * invalid control field(s)." Anything else indicates a | ||
| 14204 | * problem with L0. And we should never get here with a | ||
| 14205 | * VMFail of any type if early consistency checks are enabled. | ||
| 14206 | */ | ||
| 14207 | WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != | ||
| 14208 | VMXERR_ENTRY_INVALID_CONTROL_FIELD); | ||
| 14209 | WARN_ON_ONCE(nested_early_check); | ||
| 14210 | } | ||
| 14211 | |||
| 14212 | vmx_switch_vmcs(vcpu, &vmx->vmcs01); | ||
| 14213 | |||
| 14214 | /* Update any VMCS fields that might have changed while L2 ran */ | ||
| 14215 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); | ||
| 14216 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); | ||
| 14217 | vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); | ||
| 14218 | |||
| 14219 | if (kvm_has_tsc_control) | ||
| 14220 | decache_tsc_multiplier(vmx); | ||
| 14221 | |||
| 14222 | if (vmx->nested.change_vmcs01_virtual_apic_mode) { | ||
| 14223 | vmx->nested.change_vmcs01_virtual_apic_mode = false; | ||
| 14224 | vmx_set_virtual_apic_mode(vcpu); | ||
| 14225 | } else if (!nested_cpu_has_ept(vmcs12) && | ||
| 14226 | nested_cpu_has2(vmcs12, | ||
| 14227 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { | ||
| 14228 | vmx_flush_tlb(vcpu, true); | ||
| 14229 | } | ||
| 14230 | |||
| 14231 | /* This is needed for the same reason as it was needed in prepare_vmcs02 */ | ||
| 14232 | vmx->host_rsp = 0; | ||
| 14233 | |||
| 14234 | /* Unpin physical memory we referred to in vmcs02 */ | ||
| 14235 | if (vmx->nested.apic_access_page) { | ||
| 14236 | kvm_release_page_dirty(vmx->nested.apic_access_page); | ||
| 14237 | vmx->nested.apic_access_page = NULL; | ||
| 14238 | } | ||
| 14239 | if (vmx->nested.virtual_apic_page) { | ||
| 14240 | kvm_release_page_dirty(vmx->nested.virtual_apic_page); | ||
| 14241 | vmx->nested.virtual_apic_page = NULL; | ||
| 14242 | } | ||
| 14243 | if (vmx->nested.pi_desc_page) { | ||
| 14244 | kunmap(vmx->nested.pi_desc_page); | ||
| 14245 | kvm_release_page_dirty(vmx->nested.pi_desc_page); | ||
| 14246 | vmx->nested.pi_desc_page = NULL; | ||
| 14247 | vmx->nested.pi_desc = NULL; | ||
| 14248 | } | ||
| 14249 | |||
| 14250 | /* | ||
| 14251 | * We are now running in L2, and mmu_notifier will force a reload of the | ||
| 14252 | * page's hpa for the L2 vmcs. Need to reload it for L1 before entering L1. | ||
| 14253 | */ | ||
| 14254 | kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); | ||
| 14255 | |||
| 14256 | if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs)) | ||
| 14257 | vmx->nested.need_vmcs12_sync = true; | ||
| 14258 | |||
| 14259 | /* in case we halted in L2 */ | ||
| 14260 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | ||
| 14261 | |||
| 14262 | if (likely(!vmx->fail)) { | ||
| 14263 | /* | ||
| 14264 | * TODO: SDM says that with acknowledge interrupt on | ||
| 14265 | * exit, bit 31 of the VM-exit interrupt information | ||
| 14266 | * (valid interrupt) is always set to 1 on | ||
| 14267 | * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't | ||
| 14268 | * need kvm_cpu_has_interrupt(). See the commit | ||
| 14269 | * message for details. | ||
| 14270 | */ | ||
| 14271 | if (nested_exit_intr_ack_set(vcpu) && | ||
| 14272 | exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && | ||
| 14273 | kvm_cpu_has_interrupt(vcpu)) { | ||
| 14274 | int irq = kvm_cpu_get_interrupt(vcpu); | ||
| 14275 | WARN_ON(irq < 0); | ||
| 14276 | vmcs12->vm_exit_intr_info = irq | | ||
| 14277 | INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; | ||
| 14278 | } | ||
| 14279 | |||
| 14280 | if (exit_reason != -1) | ||
| 14281 | trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, | ||
| 14282 | vmcs12->exit_qualification, | ||
| 14283 | vmcs12->idt_vectoring_info_field, | ||
| 14284 | vmcs12->vm_exit_intr_info, | ||
| 14285 | vmcs12->vm_exit_intr_error_code, | ||
| 14286 | KVM_ISA_VMX); | ||
| 14287 | |||
| 14288 | load_vmcs12_host_state(vcpu, vmcs12); | ||
| 14289 | |||
| 14290 | return; | ||
| 14291 | } | ||
| 14292 | |||
| 14293 | /* | ||
| 14294 | * After an early L2 VM-entry failure, we're now back | ||
| 14295 | * in L1 which thinks it just finished a VMLAUNCH or | ||
| 14296 | * VMRESUME instruction, so we need to set the failure | ||
| 14297 | * flag and the VM-instruction error field of the VMCS | ||
| 14298 | * accordingly, and skip the emulated instruction. | ||
| 14299 | */ | ||
| 14300 | (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); | ||
| 14301 | |||
| 14302 | /* | ||
| 14303 | * Restore L1's host state to KVM's software model. We're here | ||
| 14304 | * because a consistency check was caught by hardware, which | ||
| 14305 | * means some amount of guest state has been propagated to KVM's | ||
| 14306 | * model and needs to be unwound to the host's state. | ||
| 14307 | */ | ||
| 14308 | nested_vmx_restore_host_state(vcpu); | ||
| 14309 | |||
| 14310 | vmx->fail = 0; | ||
| 14311 | } | ||
| 14312 | |||
| 14313 | /* | ||
| 14314 | * Forcibly leave nested mode in order to be able to reset the VCPU later on. | ||
| 14315 | */ | ||
| 14316 | static void vmx_leave_nested(struct kvm_vcpu *vcpu) | ||
| 14317 | { | ||
| 14318 | if (is_guest_mode(vcpu)) { | ||
| 14319 | to_vmx(vcpu)->nested.nested_run_pending = 0; | ||
| 14320 | nested_vmx_vmexit(vcpu, -1, 0, 0); | ||
| 14321 | } | ||
| 14322 | free_nested(vcpu); | ||
| 14323 | } | ||
| 14324 | |||
| 14325 | static int vmx_check_intercept(struct kvm_vcpu *vcpu, | ||
| 14326 | struct x86_instruction_info *info, | ||
| 14327 | enum x86_intercept_stage stage) | ||
| 14328 | { | ||
| 14329 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 14330 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; | ||
| 14331 | |||
| 14332 | /* | ||
| 14333 | * RDPID causes #UD if disabled through secondary execution controls. | ||
| 14334 | * Because it is marked as EmulateOnUD, we need to intercept it here. | ||
| 14335 | */ | ||
| 14336 | if (info->intercept == x86_intercept_rdtscp && | ||
| 14337 | !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) { | ||
| 14338 | ctxt->exception.vector = UD_VECTOR; | ||
| 14339 | ctxt->exception.error_code_valid = false; | ||
| 14340 | return X86EMUL_PROPAGATE_FAULT; | ||
| 14341 | } | ||
| 14342 | |||
| 14343 | /* TODO: check more intercepts... */ | ||
| 14344 | return X86EMUL_CONTINUE; | ||
| 14345 | } | ||
| 14346 | |||
| 14347 | #ifdef CONFIG_X86_64 | ||
| 14348 | /* (a << shift) / divisor; returns 1 on overflow, otherwise 0 */ | ||
| 14349 | static inline int u64_shl_div_u64(u64 a, unsigned int shift, | ||
| 14350 | u64 divisor, u64 *result) | ||
| 14351 | { | ||
| 14352 | u64 low = a << shift, high = a >> (64 - shift); | ||
| 14353 | |||
| 14354 | /* To avoid the overflow on divq */ | ||
| 14355 | if (high >= divisor) | ||
| 14356 | return 1; | ||
| 14357 | |||
| 14358 | /* low holds the result, high holds the remainder, which is discarded */ | ||
| 14359 | asm("divq %2\n\t" : "=a" (low), "=d" (high) : | ||
| 14360 | "rm" (divisor), "0" (low), "1" (high)); | ||
| 14361 | *result = low; | ||
| 14362 | |||
| 14363 | return 0; | ||
| 14364 | } | ||
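vmx_set_hv_timer() below uses u64_shl_div_u64() to rescale a guest TSC delta into a host TSC delta: host_delta = (guest_delta << frac_bits) / scaling_ratio. A portable userspace sketch of the same computation, not part of the patch, using __int128 instead of divq; the 48 fractional bits and the half-rate ratio are assumptions for the arithmetic, not values read from hardware:

    #include <stdint.h>
    #include <stdio.h>

    /* Same contract as u64_shl_div_u64() above: returns 1 on overflow, otherwise 0. */
    static int shl_div_u64(uint64_t a, unsigned int shift, uint64_t divisor, uint64_t *result)
    {
            unsigned __int128 q = ((unsigned __int128)a << shift) / divisor;

            if (q >> 64)
                    return 1;
            *result = (uint64_t)q;
            return 0;
    }

    int main(void)
    {
            unsigned int frac_bits = 48;          /* assumed TSC-scaling fractional bits */
            uint64_t ratio = 1ULL << 47;          /* guest TSC at half the host rate */
            uint64_t guest_delta = 1000000, host_delta;

            if (!shl_div_u64(guest_delta, frac_bits, ratio, &host_delta))
                    printf("host delta tsc = %llu\n", (unsigned long long)host_delta); /* 2000000 */
            return 0;
    }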
| 14365 | |||
| 14366 | static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) | ||
| 14367 | { | ||
| 14368 | struct vcpu_vmx *vmx; | ||
| 14369 | u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; | ||
| 14370 | |||
| 14371 | if (kvm_mwait_in_guest(vcpu->kvm)) | ||
| 14372 | return -EOPNOTSUPP; | ||
| 14373 | |||
| 14374 | vmx = to_vmx(vcpu); | ||
| 14375 | tscl = rdtsc(); | ||
| 14376 | guest_tscl = kvm_read_l1_tsc(vcpu, tscl); | ||
| 14377 | delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; | ||
| 14378 | lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns); | ||
| 14379 | |||
| 14380 | if (delta_tsc > lapic_timer_advance_cycles) | ||
| 14381 | delta_tsc -= lapic_timer_advance_cycles; | ||
| 14382 | else | ||
| 14383 | delta_tsc = 0; | ||
| 14384 | |||
| 14385 | /* Convert to host delta tsc if tsc scaling is enabled */ | ||
| 14386 | if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && | ||
| 14387 | u64_shl_div_u64(delta_tsc, | ||
| 14388 | kvm_tsc_scaling_ratio_frac_bits, | ||
| 14389 | vcpu->arch.tsc_scaling_ratio, | ||
| 14390 | &delta_tsc)) | ||
| 14391 | return -ERANGE; | ||
| 14392 | |||
| 14393 | /* | ||
| 14394 | * If the delta tsc can't fit in 32 bits after the preemption timer | ||
| 14395 | * rate shift, we can't use the preemption timer at all. | ||
| 14396 | * It might fit on later vmentries, but checking on every vmentry | ||
| 14397 | * is costly, so we just fall back to an hrtimer. | ||
| 14398 | */ | ||
| 14399 | if (delta_tsc >> (cpu_preemption_timer_multi + 32)) | ||
| 14400 | return -ERANGE; | ||
| 14401 | |||
| 14402 | vmx->hv_deadline_tsc = tscl + delta_tsc; | ||
| 14403 | return delta_tsc == 0; | ||
| 14404 | } | ||
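The range check above is the interesting part of vmx_set_hv_timer(): the VMX preemption timer is a 32-bit down-counter that ticks once every 2^rate TSC cycles, so the programmed delta must fit in (32 + rate) bits. The sketch below only models that arithmetic; the numbers and the 'rate' parameter (standing in for cpu_preemption_timer_multi) are illustrative.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Model of the delta computation and range check in vmx_set_hv_timer(). */
    static bool hv_timer_delta_ok(uint64_t guest_tscl, uint64_t deadline_tsc,
                                  uint64_t advance_cycles, unsigned int rate,
                                  uint64_t *delta_out)
    {
            uint64_t delta = deadline_tsc > guest_tscl ? deadline_tsc - guest_tscl : 0;

            /* Fire early by the lapic timer advance, as the real code does. */
            delta = delta > advance_cycles ? delta - advance_cycles : 0;

            /* Equivalent to delta >> (rate + 32) without risking a >=64-bit shift. */
            if ((delta >> 32) >> rate)
                    return false;   /* too far out: fall back to an hrtimer */

            *delta_out = delta;
            return true;
    }

    int main(void)
    {
            uint64_t delta;

            /* Deadline 10M guest TSC ticks away, 1000-cycle advance, rate 5. */
            if (hv_timer_delta_ok(1000000000ULL, 1010000000ULL, 1000, 5, &delta))
                    printf("program the preemption timer with delta %llu\n",
                           (unsigned long long)delta);
            return 0;
    }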
| 14405 | |||
| 14406 | static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) | ||
| 14407 | { | ||
| 14408 | to_vmx(vcpu)->hv_deadline_tsc = -1; | ||
| 14409 | } | ||
| 14410 | #endif | ||
| 14411 | |||
| 14412 | static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) | ||
| 14413 | { | ||
| 14414 | if (!kvm_pause_in_guest(vcpu->kvm)) | ||
| 14415 | shrink_ple_window(vcpu); | ||
| 14416 | } | ||
| 14417 | |||
| 14418 | static void vmx_slot_enable_log_dirty(struct kvm *kvm, | ||
| 14419 | struct kvm_memory_slot *slot) | ||
| 14420 | { | ||
| 14421 | kvm_mmu_slot_leaf_clear_dirty(kvm, slot); | ||
| 14422 | kvm_mmu_slot_largepage_remove_write_access(kvm, slot); | ||
| 14423 | } | ||
| 14424 | |||
| 14425 | static void vmx_slot_disable_log_dirty(struct kvm *kvm, | ||
| 14426 | struct kvm_memory_slot *slot) | ||
| 14427 | { | ||
| 14428 | kvm_mmu_slot_set_dirty(kvm, slot); | ||
| 14429 | } | ||
| 14430 | |||
| 14431 | static void vmx_flush_log_dirty(struct kvm *kvm) | ||
| 14432 | { | ||
| 14433 | kvm_flush_pml_buffers(kvm); | ||
| 14434 | } | ||
| 14435 | |||
| 14436 | static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu) | ||
| 14437 | { | ||
| 14438 | struct vmcs12 *vmcs12; | ||
| 14439 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 14440 | gpa_t gpa; | ||
| 14441 | struct page *page = NULL; | ||
| 14442 | u64 *pml_address; | ||
| 14443 | |||
| 14444 | if (is_guest_mode(vcpu)) { | ||
| 14445 | WARN_ON_ONCE(vmx->nested.pml_full); | ||
| 14446 | |||
| 14447 | /* | ||
| 14448 | * Check whether PML is enabled for the nested guest. | ||
| 14449 | * EPTP bit 6 (the A/D-bits enable) has already been | ||
| 14450 | * verified as part of A/D emulation. | ||
| 14451 | */ | ||
| 14452 | vmcs12 = get_vmcs12(vcpu); | ||
| 14453 | if (!nested_cpu_has_pml(vmcs12)) | ||
| 14454 | return 0; | ||
| 14455 | |||
| 14456 | if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { | ||
| 14457 | vmx->nested.pml_full = true; | ||
| 14458 | return 1; | ||
| 14459 | } | ||
| 14460 | |||
| 14461 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull; | ||
| 14462 | |||
| 14463 | page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address); | ||
| 14464 | if (is_error_page(page)) | ||
| 14465 | return 0; | ||
| 14466 | |||
| 14467 | pml_address = kmap(page); | ||
| 14468 | pml_address[vmcs12->guest_pml_index--] = gpa; | ||
| 14469 | kunmap(page); | ||
| 14470 | kvm_release_page_clean(page); | ||
| 14471 | } | ||
| 14472 | |||
| 14473 | return 0; | ||
| 14474 | } | ||
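For the nested case above, the guest's PML buffer is a 4KiB page of 64-bit GPAs that is filled from the highest index down; once guest_pml_index decrements past zero it wraps and compares as >= PML_ENTITY_NUM, which is how the code reports a PML-full condition on the next fault. A tiny model of that ring discipline (PML_ENTITY_NUM assumed to be 512, matching what the real code uses):

    #include <stdbool.h>
    #include <stdint.h>

    #define PML_ENTITY_NUM  512     /* assumed to match KVM's definition */

    /* Model of the guest-PML append in vmx_write_pml_buffer(). Returns
     * false when the buffer is full, i.e. when the caller would have to
     * synthesize a page-modification-log-full exit for L1. */
    static bool pml_append(uint64_t *pml_page, uint16_t *pml_index, uint64_t gpa)
    {
            if (*pml_index >= PML_ENTITY_NUM)
                    return false;

            /* Store the 4KiB-aligned GPA and move toward index 0; the u16
             * decrement wraps after index 0, which marks the buffer full. */
            pml_page[(*pml_index)--] = gpa & ~0xFFFULL;
            return true;
    }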
| 14475 | |||
| 14476 | static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, | ||
| 14477 | struct kvm_memory_slot *memslot, | ||
| 14478 | gfn_t offset, unsigned long mask) | ||
| 14479 | { | ||
| 14480 | kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); | ||
| 14481 | } | ||
| 14482 | |||
| 14483 | static void __pi_post_block(struct kvm_vcpu *vcpu) | ||
| 14484 | { | ||
| 14485 | struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); | ||
| 14486 | struct pi_desc old, new; | ||
| 14487 | unsigned int dest; | ||
| 14488 | |||
| 14489 | do { | ||
| 14490 | old.control = new.control = pi_desc->control; | ||
| 14491 | WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, | ||
| 14492 | "Wakeup handler not enabled while the VCPU is blocked\n"); | ||
| 14493 | |||
| 14494 | dest = cpu_physical_id(vcpu->cpu); | ||
| 14495 | |||
| 14496 | if (x2apic_enabled()) | ||
| 14497 | new.ndst = dest; | ||
| 14498 | else | ||
| 14499 | new.ndst = (dest << 8) & 0xFF00; | ||
| 14500 | |||
| 14501 | /* set 'NV' to 'notification vector' */ | ||
| 14502 | new.nv = POSTED_INTR_VECTOR; | ||
| 14503 | } while (cmpxchg64(&pi_desc->control, old.control, | ||
| 14504 | new.control) != old.control); | ||
| 14505 | |||
| 14506 | if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { | ||
| 14507 | spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); | ||
| 14508 | list_del(&vcpu->blocked_vcpu_list); | ||
| 14509 | spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); | ||
| 14510 | vcpu->pre_pcpu = -1; | ||
| 14511 | } | ||
| 14512 | } | ||
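The only subtle line in the update loop above is the NDST encoding: in x2APIC mode the destination field takes the full APIC ID, while in xAPIC mode the 8-bit ID has to be placed in bits 15:8. A one-line sketch of just that encoding, inferred from the code above:

    #include <stdbool.h>
    #include <stdint.h>

    /* NDST encoding used by __pi_post_block() and pi_pre_block(). */
    static uint32_t pi_encode_ndst(uint32_t apic_id, bool x2apic)
    {
            return x2apic ? apic_id : (apic_id << 8) & 0xFF00;
    }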
| 14513 | |||
| 14514 | /* | ||
| 14515 | * This routine prepares a vCPU that is about to block while | ||
| 14516 | * VT-d posted interrupts are enabled: | ||
| 14517 | * - Add the vCPU to the per-CPU wakeup list, so that when an | ||
| 14518 | * interrupt arrives we can find the right vCPU to wake up. | ||
| 14519 | * - Update the posted-interrupt descriptor as follows: | ||
| 14520 | * 'NDST' <-- vcpu->pre_pcpu | ||
| 14521 | * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR | ||
| 14522 | * - If 'ON' becomes set during this process (meaning at least | ||
| 14523 | * one interrupt is already posted for this vCPU), the vCPU | ||
| 14524 | * cannot block: return 1 in that case, otherwise return 0. | ||
| 14525 | * | ||
| 14526 | */ | ||
| 14527 | static int pi_pre_block(struct kvm_vcpu *vcpu) | ||
| 14528 | { | ||
| 14529 | unsigned int dest; | ||
| 14530 | struct pi_desc old, new; | ||
| 14531 | struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); | ||
| 14532 | |||
| 14533 | if (!kvm_arch_has_assigned_device(vcpu->kvm) || | ||
| 14534 | !irq_remapping_cap(IRQ_POSTING_CAP) || | ||
| 14535 | !kvm_vcpu_apicv_active(vcpu)) | ||
| 14536 | return 0; | ||
| 14537 | |||
| 14538 | WARN_ON(irqs_disabled()); | ||
| 14539 | local_irq_disable(); | ||
| 14540 | if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { | ||
| 14541 | vcpu->pre_pcpu = vcpu->cpu; | ||
| 14542 | spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); | ||
| 14543 | list_add_tail(&vcpu->blocked_vcpu_list, | ||
| 14544 | &per_cpu(blocked_vcpu_on_cpu, | ||
| 14545 | vcpu->pre_pcpu)); | ||
| 14546 | spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); | ||
| 14547 | } | ||
| 14548 | |||
| 14549 | do { | ||
| 14550 | old.control = new.control = pi_desc->control; | ||
| 14551 | |||
| 14552 | WARN((pi_desc->sn == 1), | ||
| 14553 | "Warning: SN field of posted-interrupts " | ||
| 14554 | "is set before blocking\n"); | ||
| 14555 | |||
| 14556 | /* | ||
| 14557 | * Since the vCPU can be preempted during this process, | ||
| 14558 | * vcpu->cpu may differ from pre_pcpu by the time we get | ||
| 14559 | * here. Use pre_pcpu as the destination of the wakeup | ||
| 14560 | * notification so that, if an interrupt arrives while | ||
| 14561 | * the vCPU is blocked, the wakeup handler can find the | ||
| 14562 | * right vCPU to wake up. | ||
| 14563 | */ | ||
| 14564 | dest = cpu_physical_id(vcpu->pre_pcpu); | ||
| 14565 | |||
| 14566 | if (x2apic_enabled()) | ||
| 14567 | new.ndst = dest; | ||
| 14568 | else | ||
| 14569 | new.ndst = (dest << 8) & 0xFF00; | ||
| 14570 | |||
| 14571 | /* set 'NV' to 'wakeup vector' */ | ||
| 14572 | new.nv = POSTED_INTR_WAKEUP_VECTOR; | ||
| 14573 | } while (cmpxchg64(&pi_desc->control, old.control, | ||
| 14574 | new.control) != old.control); | ||
| 14575 | |||
| 14576 | /* We should not block the vCPU if an interrupt is posted for it. */ | ||
| 14577 | if (pi_test_on(pi_desc) == 1) | ||
| 14578 | __pi_post_block(vcpu); | ||
| 14579 | |||
| 14580 | local_irq_enable(); | ||
| 14581 | return (vcpu->pre_pcpu == -1); | ||
| 14582 | } | ||
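Both PI descriptor updates above use the same lock-free pattern: snapshot the 64-bit control word, build the new value, and retry the cmpxchg until no concurrent update (for example hardware setting ON while posting an interrupt) slipped in between. Below is a generic user-space rendering of that pattern with C11 atomics; the field placement in the sketch is illustrative, not the real pi_desc layout.

    #include <stdatomic.h>
    #include <stdint.h>

    struct pi_control {
            _Atomic uint64_t control;       /* simplified stand-in for pi_desc->control */
    };

    /* Mirror of the do { ... } while (cmpxchg64(...) != old.control) loops:
     * recompute 'new' from a snapshot and install it only if nothing changed. */
    static void pi_update_nv(struct pi_control *pi, uint8_t new_vector)
    {
            uint64_t old = atomic_load(&pi->control);
            uint64_t new;

            do {
                    new = (old & ~0xFFULL) | new_vector;    /* NV placement illustrative */
            } while (!atomic_compare_exchange_weak(&pi->control, &old, new));
    }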
| 14583 | |||
| 14584 | static int vmx_pre_block(struct kvm_vcpu *vcpu) | ||
| 14585 | { | ||
| 14586 | if (pi_pre_block(vcpu)) | ||
| 14587 | return 1; | ||
| 14588 | |||
| 14589 | if (kvm_lapic_hv_timer_in_use(vcpu)) | ||
| 14590 | kvm_lapic_switch_to_sw_timer(vcpu); | ||
| 14591 | |||
| 14592 | return 0; | ||
| 14593 | } | ||
| 14594 | |||
| 14595 | static void pi_post_block(struct kvm_vcpu *vcpu) | ||
| 14596 | { | ||
| 14597 | if (vcpu->pre_pcpu == -1) | ||
| 14598 | return; | ||
| 14599 | |||
| 14600 | WARN_ON(irqs_disabled()); | ||
| 14601 | local_irq_disable(); | ||
| 14602 | __pi_post_block(vcpu); | ||
| 14603 | local_irq_enable(); | ||
| 14604 | } | ||
| 14605 | |||
| 14606 | static void vmx_post_block(struct kvm_vcpu *vcpu) | ||
| 14607 | { | ||
| 14608 | if (kvm_x86_ops->set_hv_timer) | ||
| 14609 | kvm_lapic_switch_to_hv_timer(vcpu); | ||
| 14610 | |||
| 14611 | pi_post_block(vcpu); | ||
| 14612 | } | ||
| 14613 | |||
| 14614 | /* | ||
| 14615 | * vmx_update_pi_irte - set IRTE for Posted-Interrupts | ||
| 14616 | * | ||
| 14617 | * @kvm: kvm | ||
| 14618 | * @host_irq: host irq of the interrupt | ||
| 14619 | * @guest_irq: gsi of the interrupt | ||
| 14620 | * @set: set or unset PI | ||
| 14621 | * returns 0 on success, < 0 on failure | ||
| 14622 | */ | ||
| 14623 | static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, | ||
| 14624 | uint32_t guest_irq, bool set) | ||
| 14625 | { | ||
| 14626 | struct kvm_kernel_irq_routing_entry *e; | ||
| 14627 | struct kvm_irq_routing_table *irq_rt; | ||
| 14628 | struct kvm_lapic_irq irq; | ||
| 14629 | struct kvm_vcpu *vcpu; | ||
| 14630 | struct vcpu_data vcpu_info; | ||
| 14631 | int idx, ret = 0; | ||
| 14632 | |||
| 14633 | if (!kvm_arch_has_assigned_device(kvm) || | ||
| 14634 | !irq_remapping_cap(IRQ_POSTING_CAP) || | ||
| 14635 | !kvm_vcpu_apicv_active(kvm->vcpus[0])) | ||
| 14636 | return 0; | ||
| 14637 | |||
| 14638 | idx = srcu_read_lock(&kvm->irq_srcu); | ||
| 14639 | irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); | ||
| 14640 | if (guest_irq >= irq_rt->nr_rt_entries || | ||
| 14641 | hlist_empty(&irq_rt->map[guest_irq])) { | ||
| 14642 | pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", | ||
| 14643 | guest_irq, irq_rt->nr_rt_entries); | ||
| 14644 | goto out; | ||
| 14645 | } | ||
| 14646 | |||
| 14647 | hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { | ||
| 14648 | if (e->type != KVM_IRQ_ROUTING_MSI) | ||
| 14649 | continue; | ||
| 14650 | /* | ||
| 14651 | * VT-d PI cannot post multicast/broadcast interrupts to a | ||
| 14652 | * vCPU, so interrupt remapping is still used for those | ||
| 14653 | * kinds of interrupts. | ||
| 14654 | * | ||
| 14655 | * For lowest-priority interrupts, only those with a single | ||
| 14656 | * CPU as the destination are supported, e.g. when the user | ||
| 14657 | * configures the interrupt via /proc/irq or uses irqbalance | ||
| 14658 | * to make the interrupt single-CPU. | ||
| 14659 | * | ||
| 14660 | * Full lowest-priority interrupt support will be added later. | ||
| 14661 | */ | ||
| 14662 | |||
| 14663 | kvm_set_msi_irq(kvm, e, &irq); | ||
| 14664 | if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { | ||
| 14665 | /* | ||
| 14666 | * Make sure the IRTE is in remapped mode if | ||
| 14667 | * we don't handle it in posted mode. | ||
| 14668 | */ | ||
| 14669 | ret = irq_set_vcpu_affinity(host_irq, NULL); | ||
| 14670 | if (ret < 0) { | ||
| 14671 | printk(KERN_INFO | ||
| 14672 | "failed to back to remapped mode, irq: %u\n", | ||
| 14673 | host_irq); | ||
| 14674 | goto out; | ||
| 14675 | } | ||
| 14676 | |||
| 14677 | continue; | ||
| 14678 | } | ||
| 14679 | |||
| 14680 | vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); | ||
| 14681 | vcpu_info.vector = irq.vector; | ||
| 14682 | |||
| 14683 | trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, | ||
| 14684 | vcpu_info.vector, vcpu_info.pi_desc_addr, set); | ||
| 14685 | |||
| 14686 | if (set) | ||
| 14687 | ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); | ||
| 14688 | else | ||
| 14689 | ret = irq_set_vcpu_affinity(host_irq, NULL); | ||
| 14690 | |||
| 14691 | if (ret < 0) { | ||
| 14692 | printk(KERN_INFO "%s: failed to update PI IRTE\n", | ||
| 14693 | __func__); | ||
| 14694 | goto out; | ||
| 14695 | } | ||
| 14696 | } | ||
| 14697 | |||
| 14698 | ret = 0; | ||
| 14699 | out: | ||
| 14700 | srcu_read_unlock(&kvm->irq_srcu, idx); | ||
| 14701 | return ret; | ||
| 14702 | } | ||
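Stripped of the routing-table walk, the per-entry decision in vmx_update_pi_irte() reduces to one predicate: an MSI route is put (or kept) in posted mode only when posting is being enabled for it and the interrupt resolves to a single vCPU; everything else goes back to remapped mode. As a sketch:

    #include <stdbool.h>

    /* Decision mirrored from the loop in vmx_update_pi_irte(). */
    static bool msi_route_should_be_posted(bool enable_posting, bool single_vcpu_dest)
    {
            return enable_posting && single_vcpu_dest;
    }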
| 14703 | |||
| 14704 | static void vmx_setup_mce(struct kvm_vcpu *vcpu) | ||
| 14705 | { | ||
| 14706 | if (vcpu->arch.mcg_cap & MCG_LMCE_P) | ||
| 14707 | to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= | ||
| 14708 | FEATURE_CONTROL_LMCE; | ||
| 14709 | else | ||
| 14710 | to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= | ||
| 14711 | ~FEATURE_CONTROL_LMCE; | ||
| 14712 | } | ||
| 14713 | |||
| 14714 | static int vmx_smi_allowed(struct kvm_vcpu *vcpu) | ||
| 14715 | { | ||
| 14716 | /* we need a nested vmexit to enter SMM, postpone if run is pending */ | ||
| 14717 | if (to_vmx(vcpu)->nested.nested_run_pending) | ||
| 14718 | return 0; | ||
| 14719 | return 1; | ||
| 14720 | } | ||
| 14721 | |||
| 14722 | static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) | ||
| 14723 | { | ||
| 14724 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 14725 | |||
| 14726 | vmx->nested.smm.guest_mode = is_guest_mode(vcpu); | ||
| 14727 | if (vmx->nested.smm.guest_mode) | ||
| 14728 | nested_vmx_vmexit(vcpu, -1, 0, 0); | ||
| 14729 | |||
| 14730 | vmx->nested.smm.vmxon = vmx->nested.vmxon; | ||
| 14731 | vmx->nested.vmxon = false; | ||
| 14732 | vmx_clear_hlt(vcpu); | ||
| 14733 | return 0; | ||
| 14734 | } | ||
| 14735 | |||
| 14736 | static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) | ||
| 14737 | { | ||
| 14738 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 14739 | int ret; | ||
| 14740 | |||
| 14741 | if (vmx->nested.smm.vmxon) { | ||
| 14742 | vmx->nested.vmxon = true; | ||
| 14743 | vmx->nested.smm.vmxon = false; | ||
| 14744 | } | ||
| 14745 | |||
| 14746 | if (vmx->nested.smm.guest_mode) { | ||
| 14747 | vcpu->arch.hflags &= ~HF_SMM_MASK; | ||
| 14748 | ret = nested_vmx_enter_non_root_mode(vcpu, false); | ||
| 14749 | vcpu->arch.hflags |= HF_SMM_MASK; | ||
| 14750 | if (ret) | ||
| 14751 | return ret; | ||
| 14752 | |||
| 14753 | vmx->nested.smm.guest_mode = false; | ||
| 14754 | } | ||
| 14755 | return 0; | ||
| 14756 | } | ||
| 14757 | |||
| 14758 | static int enable_smi_window(struct kvm_vcpu *vcpu) | ||
| 14759 | { | ||
| 14760 | return 0; | ||
| 14761 | } | ||
| 14762 | |||
| 14763 | static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu) | ||
| 14764 | { | ||
| 14765 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 14766 | |||
| 14767 | /* | ||
| 14768 | * If we do two consecutive get/set_nested_state()s while L2 is | ||
| 14769 | * running, hv_evmcs may end up not being mapped (it is mapped from | ||
| 14770 | * nested_vmx_run()/vmx_vcpu_run()). Also check is_guest_mode(), as | ||
| 14771 | * a vmcs12 always exists when it is true. | ||
| 14772 | */ | ||
| 14773 | return is_guest_mode(vcpu) || vmx->nested.current_vmptr != -1ull || | ||
| 14774 | vmx->nested.hv_evmcs; | ||
| 14775 | } | ||
| 14776 | |||
| 14777 | static int vmx_get_nested_state(struct kvm_vcpu *vcpu, | ||
| 14778 | struct kvm_nested_state __user *user_kvm_nested_state, | ||
| 14779 | u32 user_data_size) | ||
| 14780 | { | ||
| 14781 | struct vcpu_vmx *vmx; | ||
| 14782 | struct vmcs12 *vmcs12; | ||
| 14783 | struct kvm_nested_state kvm_state = { | ||
| 14784 | .flags = 0, | ||
| 14785 | .format = 0, | ||
| 14786 | .size = sizeof(kvm_state), | ||
| 14787 | .vmx.vmxon_pa = -1ull, | ||
| 14788 | .vmx.vmcs_pa = -1ull, | ||
| 14789 | }; | ||
| 14790 | |||
| 14791 | if (!vcpu) | ||
| 14792 | return kvm_state.size + 2 * VMCS12_SIZE; | ||
| 14793 | |||
| 14794 | vmx = to_vmx(vcpu); | ||
| 14795 | vmcs12 = get_vmcs12(vcpu); | ||
| 14796 | |||
| 14797 | if (nested_vmx_allowed(vcpu) && vmx->nested.enlightened_vmcs_enabled) | ||
| 14798 | kvm_state.flags |= KVM_STATE_NESTED_EVMCS; | ||
| 14799 | |||
| 14800 | if (nested_vmx_allowed(vcpu) && | ||
| 14801 | (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { | ||
| 14802 | kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr; | ||
| 14803 | kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr; | ||
| 14804 | |||
| 14805 | if (vmx_has_valid_vmcs12(vcpu)) { | ||
| 14806 | kvm_state.size += VMCS12_SIZE; | ||
| 14807 | |||
| 14808 | if (is_guest_mode(vcpu) && | ||
| 14809 | nested_cpu_has_shadow_vmcs(vmcs12) && | ||
| 14810 | vmcs12->vmcs_link_pointer != -1ull) | ||
| 14811 | kvm_state.size += VMCS12_SIZE; | ||
| 14812 | } | ||
| 14813 | |||
| 14814 | if (vmx->nested.smm.vmxon) | ||
| 14815 | kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; | ||
| 14816 | |||
| 14817 | if (vmx->nested.smm.guest_mode) | ||
| 14818 | kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; | ||
| 14819 | |||
| 14820 | if (is_guest_mode(vcpu)) { | ||
| 14821 | kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; | ||
| 14822 | |||
| 14823 | if (vmx->nested.nested_run_pending) | ||
| 14824 | kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; | ||
| 14825 | } | ||
| 14826 | } | ||
| 14827 | |||
| 14828 | if (user_data_size < kvm_state.size) | ||
| 14829 | goto out; | ||
| 14830 | |||
| 14831 | if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) | ||
| 14832 | return -EFAULT; | ||
| 14833 | |||
| 14834 | if (!vmx_has_valid_vmcs12(vcpu)) | ||
| 14835 | goto out; | ||
| 14836 | |||
| 14837 | /* | ||
| 14838 | * When running L2, the authoritative vmcs12 state is in the | ||
| 14839 | * vmcs02. When running L1, the authoritative vmcs12 state is | ||
| 14840 | * in the shadow or enlightened vmcs linked to vmcs01, unless | ||
| 14841 | * need_vmcs12_sync is set, in which case, the authoritative | ||
| 14842 | * vmcs12 state is in the vmcs12 already. | ||
| 14843 | */ | ||
| 14844 | if (is_guest_mode(vcpu)) { | ||
| 14845 | sync_vmcs12(vcpu, vmcs12); | ||
| 14846 | } else if (!vmx->nested.need_vmcs12_sync) { | ||
| 14847 | if (vmx->nested.hv_evmcs) | ||
| 14848 | copy_enlightened_to_vmcs12(vmx); | ||
| 14849 | else if (enable_shadow_vmcs) | ||
| 14850 | copy_shadow_to_vmcs12(vmx); | ||
| 14851 | } | ||
| 14852 | |||
| 14853 | if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12))) | ||
| 14854 | return -EFAULT; | ||
| 14855 | |||
| 14856 | if (nested_cpu_has_shadow_vmcs(vmcs12) && | ||
| 14857 | vmcs12->vmcs_link_pointer != -1ull) { | ||
| 14858 | if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE, | ||
| 14859 | get_shadow_vmcs12(vcpu), sizeof(*vmcs12))) | ||
| 14860 | return -EFAULT; | ||
| 14861 | } | ||
| 14862 | |||
| 14863 | out: | ||
| 14864 | return kvm_state.size; | ||
| 14865 | } | ||
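From userspace, the sizing logic above means a buffer of sizeof(struct kvm_nested_state) plus two VMCS12_SIZE blobs is always large enough (the vmcs12 plus an optional shadow vmcs12). The sketch below shows one plausible way to call the ioctl under that assumption; VMCS12_SIZE is not exported to userspace, so the 4KiB value here is an assumption, and error handling is minimal.

    #include <linux/kvm.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>

    #define VMCS12_SIZE_GUESS 4096  /* assumption: matches KVM's VMCS12_SIZE */

    /* Hypothetical helper: fetch the nested state of a vCPU fd. */
    static struct kvm_nested_state *get_nested_state(int vcpu_fd)
    {
            size_t bufsz = sizeof(struct kvm_nested_state) + 2 * VMCS12_SIZE_GUESS;
            struct kvm_nested_state *state = calloc(1, bufsz);

            if (!state)
                    return NULL;

            state->size = bufsz;    /* tell KVM how much room the buffer has */
            if (ioctl(vcpu_fd, KVM_GET_NESTED_STATE, state) < 0) {
                    perror("KVM_GET_NESTED_STATE");
                    free(state);
                    return NULL;
            }
            return state;
    }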
| 14866 | |||
| 14867 | static int vmx_set_nested_state(struct kvm_vcpu *vcpu, | ||
| 14868 | struct kvm_nested_state __user *user_kvm_nested_state, | ||
| 14869 | struct kvm_nested_state *kvm_state) | ||
| 14870 | { | ||
| 14871 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 14872 | struct vmcs12 *vmcs12; | ||
| 14873 | u32 exit_qual; | ||
| 14874 | int ret; | ||
| 14875 | |||
| 14876 | if (kvm_state->format != 0) | ||
| 14877 | return -EINVAL; | ||
| 14878 | |||
| 14879 | if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) | ||
| 14880 | nested_enable_evmcs(vcpu, NULL); | ||
| 14881 | |||
| 14882 | if (!nested_vmx_allowed(vcpu)) | ||
| 14883 | return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL; | ||
| 14884 | |||
| 14885 | if (kvm_state->vmx.vmxon_pa == -1ull) { | ||
| 14886 | if (kvm_state->vmx.smm.flags) | ||
| 14887 | return -EINVAL; | ||
| 14888 | |||
| 14889 | if (kvm_state->vmx.vmcs_pa != -1ull) | ||
| 14890 | return -EINVAL; | ||
| 14891 | |||
| 14892 | vmx_leave_nested(vcpu); | ||
| 14893 | return 0; | ||
| 14894 | } | ||
| 14895 | |||
| 14896 | if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa)) | ||
| 14897 | return -EINVAL; | ||
| 14898 | |||
| 14899 | if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && | ||
| 14900 | (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) | ||
| 14901 | return -EINVAL; | ||
| 14902 | |||
| 14903 | if (kvm_state->vmx.smm.flags & | ||
| 14904 | ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) | ||
| 14905 | return -EINVAL; | ||
| 14906 | |||
| 14907 | /* | ||
| 14908 | * SMM temporarily disables VMX, so we cannot be in guest mode, | ||
| 14909 | * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags | ||
| 14910 | * must be zero. | ||
| 14911 | */ | ||
| 14912 | if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags) | ||
| 14913 | return -EINVAL; | ||
| 14914 | |||
| 14915 | if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && | ||
| 14916 | !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) | ||
| 14917 | return -EINVAL; | ||
| 14918 | |||
| 14919 | vmx_leave_nested(vcpu); | ||
| 14920 | if (kvm_state->vmx.vmxon_pa == -1ull) | ||
| 14921 | return 0; | ||
| 14922 | |||
| 14923 | vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa; | ||
| 14924 | ret = enter_vmx_operation(vcpu); | ||
| 14925 | if (ret) | ||
| 14926 | return ret; | ||
| 14927 | |||
| 14928 | /* Empty 'VMXON' state is permitted */ | ||
| 14929 | if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12)) | ||
| 14930 | return 0; | ||
| 14931 | |||
| 14932 | if (kvm_state->vmx.vmcs_pa != -1ull) { | ||
| 14933 | if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa || | ||
| 14934 | !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa)) | ||
| 14935 | return -EINVAL; | ||
| 14936 | |||
| 14937 | set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa); | ||
| 14938 | } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { | ||
| 14939 | /* | ||
| 14940 | * Sync eVMCS upon entry as we may not have | ||
| 14941 | * HV_X64_MSR_VP_ASSIST_PAGE set up yet. | ||
| 14942 | */ | ||
| 14943 | vmx->nested.need_vmcs12_sync = true; | ||
| 14944 | } else { | ||
| 14945 | return -EINVAL; | ||
| 14946 | } | ||
| 14947 | |||
| 14948 | if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { | ||
| 14949 | vmx->nested.smm.vmxon = true; | ||
| 14950 | vmx->nested.vmxon = false; | ||
| 14951 | |||
| 14952 | if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) | ||
| 14953 | vmx->nested.smm.guest_mode = true; | ||
| 14954 | } | ||
| 14955 | |||
| 14956 | vmcs12 = get_vmcs12(vcpu); | ||
| 14957 | if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12))) | ||
| 14958 | return -EFAULT; | ||
| 14959 | |||
| 14960 | if (vmcs12->hdr.revision_id != VMCS12_REVISION) | ||
| 14961 | return -EINVAL; | ||
| 14962 | |||
| 14963 | if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) | ||
| 14964 | return 0; | ||
| 14965 | |||
| 14966 | vmx->nested.nested_run_pending = | ||
| 14967 | !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); | ||
| 14968 | |||
| 14969 | if (nested_cpu_has_shadow_vmcs(vmcs12) && | ||
| 14970 | vmcs12->vmcs_link_pointer != -1ull) { | ||
| 14971 | struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); | ||
| 14972 | if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12)) | ||
| 14973 | return -EINVAL; | ||
| 14974 | |||
| 14975 | if (copy_from_user(shadow_vmcs12, | ||
| 14976 | user_kvm_nested_state->data + VMCS12_SIZE, | ||
| 14977 | sizeof(*vmcs12))) | ||
| 14978 | return -EFAULT; | ||
| 14979 | |||
| 14980 | if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || | ||
| 14981 | !shadow_vmcs12->hdr.shadow_vmcs) | ||
| 14982 | return -EINVAL; | ||
| 14983 | } | ||
| 14984 | |||
| 14985 | if (check_vmentry_prereqs(vcpu, vmcs12) || | ||
| 14986 | check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) | ||
| 14987 | return -EINVAL; | ||
| 14988 | |||
| 14989 | vmx->nested.dirty_vmcs12 = true; | ||
| 14990 | ret = nested_vmx_enter_non_root_mode(vcpu, false); | ||
| 14991 | if (ret) | ||
| 14992 | return -EINVAL; | ||
| 14993 | |||
| 14994 | return 0; | ||
| 14995 | } | ||
| 14996 | |||
| 14997 | static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { | ||
| 14998 | .cpu_has_kvm_support = cpu_has_kvm_support, | ||
| 14999 | .disabled_by_bios = vmx_disabled_by_bios, | ||
| 15000 | .hardware_setup = hardware_setup, | ||
| 15001 | .hardware_unsetup = hardware_unsetup, | ||
| 15002 | .check_processor_compatibility = vmx_check_processor_compat, | ||
| 15003 | .hardware_enable = hardware_enable, | ||
| 15004 | .hardware_disable = hardware_disable, | ||
| 15005 | .cpu_has_accelerated_tpr = report_flexpriority, | ||
| 15006 | .has_emulated_msr = vmx_has_emulated_msr, | ||
| 15007 | |||
| 15008 | .vm_init = vmx_vm_init, | ||
| 15009 | .vm_alloc = vmx_vm_alloc, | ||
| 15010 | .vm_free = vmx_vm_free, | ||
| 15011 | |||
| 15012 | .vcpu_create = vmx_create_vcpu, | ||
| 15013 | .vcpu_free = vmx_free_vcpu, | ||
| 15014 | .vcpu_reset = vmx_vcpu_reset, | ||
| 15015 | |||
| 15016 | .prepare_guest_switch = vmx_prepare_switch_to_guest, | ||
| 15017 | .vcpu_load = vmx_vcpu_load, | ||
| 15018 | .vcpu_put = vmx_vcpu_put, | ||
| 15019 | |||
| 15020 | .update_bp_intercept = update_exception_bitmap, | ||
| 15021 | .get_msr_feature = vmx_get_msr_feature, | ||
| 15022 | .get_msr = vmx_get_msr, | ||
| 15023 | .set_msr = vmx_set_msr, | ||
| 15024 | .get_segment_base = vmx_get_segment_base, | ||
| 15025 | .get_segment = vmx_get_segment, | ||
| 15026 | .set_segment = vmx_set_segment, | ||
| 15027 | .get_cpl = vmx_get_cpl, | ||
| 15028 | .get_cs_db_l_bits = vmx_get_cs_db_l_bits, | ||
| 15029 | .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, | ||
| 15030 | .decache_cr3 = vmx_decache_cr3, | ||
| 15031 | .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, | ||
| 15032 | .set_cr0 = vmx_set_cr0, | ||
| 15033 | .set_cr3 = vmx_set_cr3, | ||
| 15034 | .set_cr4 = vmx_set_cr4, | ||
| 15035 | .set_efer = vmx_set_efer, | ||
| 15036 | .get_idt = vmx_get_idt, | ||
| 15037 | .set_idt = vmx_set_idt, | ||
| 15038 | .get_gdt = vmx_get_gdt, | ||
| 15039 | .set_gdt = vmx_set_gdt, | ||
| 15040 | .get_dr6 = vmx_get_dr6, | ||
| 15041 | .set_dr6 = vmx_set_dr6, | ||
| 15042 | .set_dr7 = vmx_set_dr7, | ||
| 15043 | .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, | ||
| 15044 | .cache_reg = vmx_cache_reg, | ||
| 15045 | .get_rflags = vmx_get_rflags, | ||
| 15046 | .set_rflags = vmx_set_rflags, | ||
| 15047 | |||
| 15048 | .tlb_flush = vmx_flush_tlb, | ||
| 15049 | .tlb_flush_gva = vmx_flush_tlb_gva, | ||
| 15050 | |||
| 15051 | .run = vmx_vcpu_run, | ||
| 15052 | .handle_exit = vmx_handle_exit, | ||
| 15053 | .skip_emulated_instruction = skip_emulated_instruction, | ||
| 15054 | .set_interrupt_shadow = vmx_set_interrupt_shadow, | ||
| 15055 | .get_interrupt_shadow = vmx_get_interrupt_shadow, | ||
| 15056 | .patch_hypercall = vmx_patch_hypercall, | ||
| 15057 | .set_irq = vmx_inject_irq, | ||
| 15058 | .set_nmi = vmx_inject_nmi, | ||
| 15059 | .queue_exception = vmx_queue_exception, | ||
| 15060 | .cancel_injection = vmx_cancel_injection, | ||
| 15061 | .interrupt_allowed = vmx_interrupt_allowed, | ||
| 15062 | .nmi_allowed = vmx_nmi_allowed, | ||
| 15063 | .get_nmi_mask = vmx_get_nmi_mask, | ||
| 15064 | .set_nmi_mask = vmx_set_nmi_mask, | ||
| 15065 | .enable_nmi_window = enable_nmi_window, | ||
| 15066 | .enable_irq_window = enable_irq_window, | ||
| 15067 | .update_cr8_intercept = update_cr8_intercept, | ||
| 15068 | .set_virtual_apic_mode = vmx_set_virtual_apic_mode, | ||
| 15069 | .set_apic_access_page_addr = vmx_set_apic_access_page_addr, | ||
| 15070 | .get_enable_apicv = vmx_get_enable_apicv, | ||
| 15071 | .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, | ||
| 15072 | .load_eoi_exitmap = vmx_load_eoi_exitmap, | ||
| 15073 | .apicv_post_state_restore = vmx_apicv_post_state_restore, | ||
| 15074 | .hwapic_irr_update = vmx_hwapic_irr_update, | ||
| 15075 | .hwapic_isr_update = vmx_hwapic_isr_update, | ||
| 15076 | .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, | ||
| 15077 | .sync_pir_to_irr = vmx_sync_pir_to_irr, | ||
| 15078 | .deliver_posted_interrupt = vmx_deliver_posted_interrupt, | ||
| 15079 | |||
| 15080 | .set_tss_addr = vmx_set_tss_addr, | ||
| 15081 | .set_identity_map_addr = vmx_set_identity_map_addr, | ||
| 15082 | .get_tdp_level = get_ept_level, | ||
| 15083 | .get_mt_mask = vmx_get_mt_mask, | ||
| 15084 | |||
| 15085 | .get_exit_info = vmx_get_exit_info, | ||
| 15086 | |||
| 15087 | .get_lpage_level = vmx_get_lpage_level, | ||
| 15088 | |||
| 15089 | .cpuid_update = vmx_cpuid_update, | ||
| 15090 | |||
| 15091 | .rdtscp_supported = vmx_rdtscp_supported, | ||
| 15092 | .invpcid_supported = vmx_invpcid_supported, | ||
| 15093 | |||
| 15094 | .set_supported_cpuid = vmx_set_supported_cpuid, | ||
| 15095 | |||
| 15096 | .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, | ||
| 15097 | |||
| 15098 | .read_l1_tsc_offset = vmx_read_l1_tsc_offset, | ||
| 15099 | .write_l1_tsc_offset = vmx_write_l1_tsc_offset, | ||
| 15100 | |||
| 15101 | .set_tdp_cr3 = vmx_set_cr3, | ||
| 15102 | |||
| 15103 | .check_intercept = vmx_check_intercept, | ||
| 15104 | .handle_external_intr = vmx_handle_external_intr, | ||
| 15105 | .mpx_supported = vmx_mpx_supported, | ||
| 15106 | .xsaves_supported = vmx_xsaves_supported, | ||
| 15107 | .umip_emulated = vmx_umip_emulated, | ||
| 15108 | |||
| 15109 | .check_nested_events = vmx_check_nested_events, | ||
| 15110 | .request_immediate_exit = vmx_request_immediate_exit, | ||
| 15111 | |||
| 15112 | .sched_in = vmx_sched_in, | ||
| 15113 | |||
| 15114 | .slot_enable_log_dirty = vmx_slot_enable_log_dirty, | ||
| 15115 | .slot_disable_log_dirty = vmx_slot_disable_log_dirty, | ||
| 15116 | .flush_log_dirty = vmx_flush_log_dirty, | ||
| 15117 | .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, | ||
| 15118 | .write_log_dirty = vmx_write_pml_buffer, | ||
| 15119 | |||
| 15120 | .pre_block = vmx_pre_block, | ||
| 15121 | .post_block = vmx_post_block, | ||
| 15122 | |||
| 15123 | .pmu_ops = &intel_pmu_ops, | ||
| 15124 | |||
| 15125 | .update_pi_irte = vmx_update_pi_irte, | ||
| 15126 | |||
| 15127 | #ifdef CONFIG_X86_64 | ||
| 15128 | .set_hv_timer = vmx_set_hv_timer, | ||
| 15129 | .cancel_hv_timer = vmx_cancel_hv_timer, | ||
| 15130 | #endif | ||
| 15131 | |||
| 15132 | .setup_mce = vmx_setup_mce, | ||
| 15133 | |||
| 15134 | .get_nested_state = vmx_get_nested_state, | ||
| 15135 | .set_nested_state = vmx_set_nested_state, | ||
| 15136 | .get_vmcs12_pages = nested_get_vmcs12_pages, | ||
| 15137 | |||
| 15138 | .smi_allowed = vmx_smi_allowed, | ||
| 15139 | .pre_enter_smm = vmx_pre_enter_smm, | ||
| 15140 | .pre_leave_smm = vmx_pre_leave_smm, | ||
| 15141 | .enable_smi_window = enable_smi_window, | ||
| 15142 | |||
| 15143 | .nested_enable_evmcs = nested_enable_evmcs, | ||
| 15144 | }; | ||
| 15145 | |||
| 15146 | static void vmx_cleanup_l1d_flush(void) | ||
| 15147 | { | ||
| 15148 | if (vmx_l1d_flush_pages) { | ||
| 15149 | free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); | ||
| 15150 | vmx_l1d_flush_pages = NULL; | ||
| 15151 | } | ||
| 15152 | /* Restore state so sysfs ignores VMX */ | ||
| 15153 | l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; | ||
| 15154 | } | ||
| 15155 | |||
| 15156 | static void vmx_exit(void) | ||
| 15157 | { | ||
| 15158 | #ifdef CONFIG_KEXEC_CORE | ||
| 15159 | RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); | ||
| 15160 | synchronize_rcu(); | ||
| 15161 | #endif | ||
| 15162 | |||
| 15163 | kvm_exit(); | ||
| 15164 | |||
| 15165 | #if IS_ENABLED(CONFIG_HYPERV) | ||
| 15166 | if (static_branch_unlikely(&enable_evmcs)) { | ||
| 15167 | int cpu; | ||
| 15168 | struct hv_vp_assist_page *vp_ap; | ||
| 15169 | /* | ||
| 15170 | * Reset everything to support using non-enlightened VMCS | ||
| 15171 | * access later (e.g. when we reload the module with | ||
| 15172 | * enlightened_vmcs=0) | ||
| 15173 | */ | ||
| 15174 | for_each_online_cpu(cpu) { | ||
| 15175 | vp_ap = hv_get_vp_assist_page(cpu); | ||
| 15176 | |||
| 15177 | if (!vp_ap) | ||
| 15178 | continue; | ||
| 15179 | |||
| 15180 | vp_ap->current_nested_vmcs = 0; | ||
| 15181 | vp_ap->enlighten_vmentry = 0; | ||
| 15182 | } | ||
| 15183 | |||
| 15184 | static_branch_disable(&enable_evmcs); | ||
| 15185 | } | ||
| 15186 | #endif | ||
| 15187 | vmx_cleanup_l1d_flush(); | ||
| 15188 | } | ||
| 15189 | module_exit(vmx_exit); | ||
| 15190 | |||
| 15191 | static int __init vmx_init(void) | ||
| 15192 | { | ||
| 15193 | int r; | ||
| 15194 | |||
| 15195 | #if IS_ENABLED(CONFIG_HYPERV) | ||
| 15196 | /* | ||
| 15197 | * Enlightened VMCS is used only if Hyper-V recommends it and the | ||
| 15198 | * host supports eVMCS v1 or above. eVMCS support can also be | ||
| 15199 | * disabled with the enlightened_vmcs module parameter. | ||
| 15200 | */ | ||
| 15201 | if (enlightened_vmcs && | ||
| 15202 | ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && | ||
| 15203 | (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= | ||
| 15204 | KVM_EVMCS_VERSION) { | ||
| 15205 | int cpu; | ||
| 15206 | |||
| 15207 | /* Check that we have assist pages on all online CPUs */ | ||
| 15208 | for_each_online_cpu(cpu) { | ||
| 15209 | if (!hv_get_vp_assist_page(cpu)) { | ||
| 15210 | enlightened_vmcs = false; | ||
| 15211 | break; | ||
| 15212 | } | ||
| 15213 | } | ||
| 15214 | |||
| 15215 | if (enlightened_vmcs) { | ||
| 15216 | pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n"); | ||
| 15217 | static_branch_enable(&enable_evmcs); | ||
| 15218 | } | ||
| 15219 | } else { | ||
| 15220 | enlightened_vmcs = false; | ||
| 15221 | } | ||
| 15222 | #endif | ||
| 15223 | |||
| 15224 | r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), | ||
| 15225 | __alignof__(struct vcpu_vmx), THIS_MODULE); | ||
| 15226 | if (r) | ||
| 15227 | return r; | ||
| 15228 | |||
| 15229 | /* | ||
| 15230 | * Must be called after kvm_init() so that enable_ept is properly | ||
| 15231 | * set up. Hand in the mitigation parameter value that was stored | ||
| 15232 | * by the pre-module-init parser. If no parameter was given, it | ||
| 15233 | * contains 'auto', which is turned into the default 'cond' | ||
| 15234 | * mitigation mode. | ||
| 15235 | */ | ||
| 15236 | if (boot_cpu_has(X86_BUG_L1TF)) { | ||
| 15237 | r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); | ||
| 15238 | if (r) { | ||
| 15239 | vmx_exit(); | ||
| 15240 | return r; | ||
| 15241 | } | ||
| 15242 | } | ||
| 15243 | |||
| 15244 | #ifdef CONFIG_KEXEC_CORE | ||
| 15245 | rcu_assign_pointer(crash_vmclear_loaded_vmcss, | ||
| 15246 | crash_vmclear_local_loaded_vmcss); | ||
| 15247 | #endif | ||
| 15248 | vmx_check_vmcs12_offsets(); | ||
| 15249 | |||
| 15250 | return 0; | ||
| 15251 | } | ||
| 15252 | module_init(vmx_init); | ||
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h new file mode 100644 index 000000000000..854e144131c6 --- /dev/null +++ b/arch/x86/kvm/vmx/capabilities.h | |||
| @@ -0,0 +1,343 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
| 2 | #ifndef __KVM_X86_VMX_CAPS_H | ||
| 3 | #define __KVM_X86_VMX_CAPS_H | ||
| 4 | |||
| 5 | #include "lapic.h" | ||
| 6 | |||
| 7 | extern bool __read_mostly enable_vpid; | ||
| 8 | extern bool __read_mostly flexpriority_enabled; | ||
| 9 | extern bool __read_mostly enable_ept; | ||
| 10 | extern bool __read_mostly enable_unrestricted_guest; | ||
| 11 | extern bool __read_mostly enable_ept_ad_bits; | ||
| 12 | extern bool __read_mostly enable_pml; | ||
| 13 | extern int __read_mostly pt_mode; | ||
| 14 | |||
| 15 | #define PT_MODE_SYSTEM 0 | ||
| 16 | #define PT_MODE_HOST_GUEST 1 | ||
| 17 | |||
| 18 | struct nested_vmx_msrs { | ||
| 19 | /* | ||
| 20 | * We only store the "true" versions of the VMX capability MSRs. We | ||
| 21 | * generate the "non-true" versions by setting the must-be-1 bits | ||
| 22 | * according to the SDM. | ||
| 23 | */ | ||
| 24 | u32 procbased_ctls_low; | ||
| 25 | u32 procbased_ctls_high; | ||
| 26 | u32 secondary_ctls_low; | ||
| 27 | u32 secondary_ctls_high; | ||
| 28 | u32 pinbased_ctls_low; | ||
| 29 | u32 pinbased_ctls_high; | ||
| 30 | u32 exit_ctls_low; | ||
| 31 | u32 exit_ctls_high; | ||
| 32 | u32 entry_ctls_low; | ||
| 33 | u32 entry_ctls_high; | ||
| 34 | u32 misc_low; | ||
| 35 | u32 misc_high; | ||
| 36 | u32 ept_caps; | ||
| 37 | u32 vpid_caps; | ||
| 38 | u64 basic; | ||
| 39 | u64 cr0_fixed0; | ||
| 40 | u64 cr0_fixed1; | ||
| 41 | u64 cr4_fixed0; | ||
| 42 | u64 cr4_fixed1; | ||
| 43 | u64 vmcs_enum; | ||
| 44 | u64 vmfunc_controls; | ||
| 45 | }; | ||
| 46 | |||
| 47 | struct vmcs_config { | ||
| 48 | int size; | ||
| 49 | int order; | ||
| 50 | u32 basic_cap; | ||
| 51 | u32 revision_id; | ||
| 52 | u32 pin_based_exec_ctrl; | ||
| 53 | u32 cpu_based_exec_ctrl; | ||
| 54 | u32 cpu_based_2nd_exec_ctrl; | ||
| 55 | u32 vmexit_ctrl; | ||
| 56 | u32 vmentry_ctrl; | ||
| 57 | struct nested_vmx_msrs nested; | ||
| 58 | }; | ||
| 59 | extern struct vmcs_config vmcs_config; | ||
| 60 | |||
| 61 | struct vmx_capability { | ||
| 62 | u32 ept; | ||
| 63 | u32 vpid; | ||
| 64 | }; | ||
| 65 | extern struct vmx_capability vmx_capability; | ||
| 66 | |||
| 67 | static inline bool cpu_has_vmx_basic_inout(void) | ||
| 68 | { | ||
| 69 | return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT); | ||
| 70 | } | ||
| 71 | |||
| 72 | static inline bool cpu_has_virtual_nmis(void) | ||
| 73 | { | ||
| 74 | return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; | ||
| 75 | } | ||
| 76 | |||
| 77 | static inline bool cpu_has_vmx_preemption_timer(void) | ||
| 78 | { | ||
| 79 | return vmcs_config.pin_based_exec_ctrl & | ||
| 80 | PIN_BASED_VMX_PREEMPTION_TIMER; | ||
| 81 | } | ||
| 82 | |||
| 83 | static inline bool cpu_has_vmx_posted_intr(void) | ||
| 84 | { | ||
| 85 | return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && | ||
| 86 | vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; | ||
| 87 | } | ||
| 88 | |||
| 89 | static inline bool cpu_has_load_ia32_efer(void) | ||
| 90 | { | ||
| 91 | return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER) && | ||
| 92 | (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_EFER); | ||
| 93 | } | ||
| 94 | |||
| 95 | static inline bool cpu_has_load_perf_global_ctrl(void) | ||
| 96 | { | ||
| 97 | return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && | ||
| 98 | (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); | ||
| 99 | } | ||
| 100 | |||
| 101 | static inline bool vmx_mpx_supported(void) | ||
| 102 | { | ||
| 103 | return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && | ||
| 104 | (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); | ||
| 105 | } | ||
| 106 | |||
| 107 | static inline bool cpu_has_vmx_tpr_shadow(void) | ||
| 108 | { | ||
| 109 | return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; | ||
| 110 | } | ||
| 111 | |||
| 112 | static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu) | ||
| 113 | { | ||
| 114 | return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu); | ||
| 115 | } | ||
| 116 | |||
| 117 | static inline bool cpu_has_vmx_msr_bitmap(void) | ||
| 118 | { | ||
| 119 | return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; | ||
| 120 | } | ||
| 121 | |||
| 122 | static inline bool cpu_has_secondary_exec_ctrls(void) | ||
| 123 | { | ||
| 124 | return vmcs_config.cpu_based_exec_ctrl & | ||
| 125 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; | ||
| 126 | } | ||
| 127 | |||
| 128 | static inline bool cpu_has_vmx_virtualize_apic_accesses(void) | ||
| 129 | { | ||
| 130 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 131 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
| 132 | } | ||
| 133 | |||
| 134 | static inline bool cpu_has_vmx_ept(void) | ||
| 135 | { | ||
| 136 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 137 | SECONDARY_EXEC_ENABLE_EPT; | ||
| 138 | } | ||
| 139 | |||
| 140 | static inline bool vmx_umip_emulated(void) | ||
| 141 | { | ||
| 142 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 143 | SECONDARY_EXEC_DESC; | ||
| 144 | } | ||
| 145 | |||
| 146 | static inline bool cpu_has_vmx_rdtscp(void) | ||
| 147 | { | ||
| 148 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 149 | SECONDARY_EXEC_RDTSCP; | ||
| 150 | } | ||
| 151 | |||
| 152 | static inline bool cpu_has_vmx_virtualize_x2apic_mode(void) | ||
| 153 | { | ||
| 154 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 155 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; | ||
| 156 | } | ||
| 157 | |||
| 158 | static inline bool cpu_has_vmx_vpid(void) | ||
| 159 | { | ||
| 160 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 161 | SECONDARY_EXEC_ENABLE_VPID; | ||
| 162 | } | ||
| 163 | |||
| 164 | static inline bool cpu_has_vmx_wbinvd_exit(void) | ||
| 165 | { | ||
| 166 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 167 | SECONDARY_EXEC_WBINVD_EXITING; | ||
| 168 | } | ||
| 169 | |||
| 170 | static inline bool cpu_has_vmx_unrestricted_guest(void) | ||
| 171 | { | ||
| 172 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 173 | SECONDARY_EXEC_UNRESTRICTED_GUEST; | ||
| 174 | } | ||
| 175 | |||
| 176 | static inline bool cpu_has_vmx_apic_register_virt(void) | ||
| 177 | { | ||
| 178 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 179 | SECONDARY_EXEC_APIC_REGISTER_VIRT; | ||
| 180 | } | ||
| 181 | |||
| 182 | static inline bool cpu_has_vmx_virtual_intr_delivery(void) | ||
| 183 | { | ||
| 184 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 185 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; | ||
| 186 | } | ||
| 187 | |||
| 188 | static inline bool cpu_has_vmx_ple(void) | ||
| 189 | { | ||
| 190 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 191 | SECONDARY_EXEC_PAUSE_LOOP_EXITING; | ||
| 192 | } | ||
| 193 | |||
| 194 | static inline bool vmx_rdrand_supported(void) | ||
| 195 | { | ||
| 196 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 197 | SECONDARY_EXEC_RDRAND_EXITING; | ||
| 198 | } | ||
| 199 | |||
| 200 | static inline bool cpu_has_vmx_invpcid(void) | ||
| 201 | { | ||
| 202 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 203 | SECONDARY_EXEC_ENABLE_INVPCID; | ||
| 204 | } | ||
| 205 | |||
| 206 | static inline bool cpu_has_vmx_vmfunc(void) | ||
| 207 | { | ||
| 208 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 209 | SECONDARY_EXEC_ENABLE_VMFUNC; | ||
| 210 | } | ||
| 211 | |||
| 212 | static inline bool cpu_has_vmx_shadow_vmcs(void) | ||
| 213 | { | ||
| 214 | u64 vmx_msr; | ||
| 215 | |||
| 216 | /* check if the cpu supports writing r/o exit information fields */ | ||
| 217 | rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); | ||
| 218 | if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) | ||
| 219 | return false; | ||
| 220 | |||
| 221 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 222 | SECONDARY_EXEC_SHADOW_VMCS; | ||
| 223 | } | ||
| 224 | |||
| 225 | static inline bool cpu_has_vmx_encls_vmexit(void) | ||
| 226 | { | ||
| 227 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 228 | SECONDARY_EXEC_ENCLS_EXITING; | ||
| 229 | } | ||
| 230 | |||
| 231 | static inline bool vmx_rdseed_supported(void) | ||
| 232 | { | ||
| 233 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 234 | SECONDARY_EXEC_RDSEED_EXITING; | ||
| 235 | } | ||
| 236 | |||
| 237 | static inline bool cpu_has_vmx_pml(void) | ||
| 238 | { | ||
| 239 | return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; | ||
| 240 | } | ||
| 241 | |||
| 242 | static inline bool vmx_xsaves_supported(void) | ||
| 243 | { | ||
| 244 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 245 | SECONDARY_EXEC_XSAVES; | ||
| 246 | } | ||
| 247 | |||
| 248 | static inline bool cpu_has_vmx_tsc_scaling(void) | ||
| 249 | { | ||
| 250 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 251 | SECONDARY_EXEC_TSC_SCALING; | ||
| 252 | } | ||
| 253 | |||
| 254 | static inline bool cpu_has_vmx_apicv(void) | ||
| 255 | { | ||
| 256 | return cpu_has_vmx_apic_register_virt() && | ||
| 257 | cpu_has_vmx_virtual_intr_delivery() && | ||
| 258 | cpu_has_vmx_posted_intr(); | ||
| 259 | } | ||
| 260 | |||
| 261 | static inline bool cpu_has_vmx_flexpriority(void) | ||
| 262 | { | ||
| 263 | return cpu_has_vmx_tpr_shadow() && | ||
| 264 | cpu_has_vmx_virtualize_apic_accesses(); | ||
| 265 | } | ||
| 266 | |||
| 267 | static inline bool cpu_has_vmx_ept_execute_only(void) | ||
| 268 | { | ||
| 269 | return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; | ||
| 270 | } | ||
| 271 | |||
| 272 | static inline bool cpu_has_vmx_ept_4levels(void) | ||
| 273 | { | ||
| 274 | return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; | ||
| 275 | } | ||
| 276 | |||
| 277 | static inline bool cpu_has_vmx_ept_5levels(void) | ||
| 278 | { | ||
| 279 | return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT; | ||
| 280 | } | ||
| 281 | |||
| 282 | static inline bool cpu_has_vmx_ept_mt_wb(void) | ||
| 283 | { | ||
| 284 | return vmx_capability.ept & VMX_EPTP_WB_BIT; | ||
| 285 | } | ||
| 286 | |||
| 287 | static inline bool cpu_has_vmx_ept_2m_page(void) | ||
| 288 | { | ||
| 289 | return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; | ||
| 290 | } | ||
| 291 | |||
| 292 | static inline bool cpu_has_vmx_ept_1g_page(void) | ||
| 293 | { | ||
| 294 | return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; | ||
| 295 | } | ||
| 296 | |||
| 297 | static inline bool cpu_has_vmx_ept_ad_bits(void) | ||
| 298 | { | ||
| 299 | return vmx_capability.ept & VMX_EPT_AD_BIT; | ||
| 300 | } | ||
| 301 | |||
| 302 | static inline bool cpu_has_vmx_invept_context(void) | ||
| 303 | { | ||
| 304 | return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; | ||
| 305 | } | ||
| 306 | |||
| 307 | static inline bool cpu_has_vmx_invept_global(void) | ||
| 308 | { | ||
| 309 | return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; | ||
| 310 | } | ||
| 311 | |||
| 312 | static inline bool cpu_has_vmx_invvpid(void) | ||
| 313 | { | ||
| 314 | return vmx_capability.vpid & VMX_VPID_INVVPID_BIT; | ||
| 315 | } | ||
| 316 | |||
| 317 | static inline bool cpu_has_vmx_invvpid_individual_addr(void) | ||
| 318 | { | ||
| 319 | return vmx_capability.vpid & VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT; | ||
| 320 | } | ||
| 321 | |||
| 322 | static inline bool cpu_has_vmx_invvpid_single(void) | ||
| 323 | { | ||
| 324 | return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT; | ||
| 325 | } | ||
| 326 | |||
| 327 | static inline bool cpu_has_vmx_invvpid_global(void) | ||
| 328 | { | ||
| 329 | return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; | ||
| 330 | } | ||
| 331 | |||
| 332 | static inline bool cpu_has_vmx_intel_pt(void) | ||
| 333 | { | ||
| 334 | u64 vmx_msr; | ||
| 335 | |||
| 336 | rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); | ||
| 337 | return (vmx_msr & MSR_IA32_VMX_MISC_INTEL_PT) && | ||
| 338 | (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) && | ||
| 339 | (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_IA32_RTIT_CTL) && | ||
| 340 | (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL); | ||
| 341 | } | ||
| 342 | |||
| 343 | #endif /* __KVM_X86_VMX_CAPS_H */ | ||
diff --git a/arch/x86/kvm/vmx_evmcs.h b/arch/x86/kvm/vmx/evmcs.c index 210a884090ad..95bc2247478d 100644 --- a/arch/x86/kvm/vmx_evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.c | |||
| @@ -1,20 +1,22 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0 */ | 1 | // SPDX-License-Identifier: GPL-2.0 |
| 2 | #ifndef __KVM_X86_VMX_EVMCS_H | ||
| 3 | #define __KVM_X86_VMX_EVMCS_H | ||
| 4 | 2 | ||
| 5 | #include <asm/hyperv-tlfs.h> | 3 | #include <linux/errno.h> |
| 4 | #include <linux/smp.h> | ||
| 5 | |||
| 6 | #include "evmcs.h" | ||
| 7 | #include "vmcs.h" | ||
| 8 | #include "vmx.h" | ||
| 9 | |||
| 10 | DEFINE_STATIC_KEY_FALSE(enable_evmcs); | ||
| 11 | |||
| 12 | #if IS_ENABLED(CONFIG_HYPERV) | ||
| 6 | 13 | ||
| 7 | #define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) | 14 | #define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) |
| 8 | #define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) | 15 | #define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) |
| 9 | #define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ | 16 | #define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ |
| 10 | {EVMCS1_OFFSET(name), clean_field} | 17 | {EVMCS1_OFFSET(name), clean_field} |
| 11 | 18 | ||
| 12 | struct evmcs_field { | 19 | const struct evmcs_field vmcs_field_to_evmcs_1[] = { |
| 13 | u16 offset; | ||
| 14 | u16 clean_field; | ||
| 15 | }; | ||
| 16 | |||
| 17 | static const struct evmcs_field vmcs_field_to_evmcs_1[] = { | ||
| 18 | /* 64 bit rw */ | 20 | /* 64 bit rw */ |
| 19 | EVMCS1_FIELD(GUEST_RIP, guest_rip, | 21 | EVMCS1_FIELD(GUEST_RIP, guest_rip, |
| 20 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), | 22 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), |
| @@ -298,27 +300,53 @@ static const struct evmcs_field vmcs_field_to_evmcs_1[] = { | |||
| 298 | EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id, | 300 | EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id, |
| 299 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), | 301 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), |
| 300 | }; | 302 | }; |
| 303 | const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1); | ||
| 301 | 304 | ||
| 302 | static __always_inline int get_evmcs_offset(unsigned long field, | 305 | void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) |
| 303 | u16 *clean_field) | ||
| 304 | { | 306 | { |
| 305 | unsigned int index = ROL16(field, 6); | 307 | vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL; |
| 306 | const struct evmcs_field *evmcs_field; | 308 | vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC; |
| 307 | 309 | ||
| 308 | if (unlikely(index >= ARRAY_SIZE(vmcs_field_to_evmcs_1))) { | 310 | vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; |
| 309 | WARN_ONCE(1, "KVM: accessing unsupported EVMCS field %lx\n", | 311 | vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; |
| 310 | field); | ||
| 311 | return -ENOENT; | ||
| 312 | } | ||
| 313 | 312 | ||
| 314 | evmcs_field = &vmcs_field_to_evmcs_1[index]; | 313 | } |
| 314 | #endif | ||
| 315 | 315 | ||
| 316 | if (clean_field) | 316 | uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu) |
| 317 | *clean_field = evmcs_field->clean_field; | 317 | { |
| 318 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 319 | /* | ||
| 320 | * vmcs_version represents the range of supported Enlightened VMCS | ||
| 321 | * versions: lower 8 bits is the minimal version, higher 8 bits is the | ||
| 322 | * maximum supported version. KVM supports versions from 1 to | ||
| 323 | * KVM_EVMCS_VERSION. | ||
| 324 | */ | ||
| 325 | if (vmx->nested.enlightened_vmcs_enabled) | ||
| 326 | return (KVM_EVMCS_VERSION << 8) | 1; | ||
| 318 | 327 | ||
| 319 | return evmcs_field->offset; | 328 | return 0; |
| 320 | } | 329 | } |
| 321 | 330 | ||
| 322 | #undef ROL16 | 331 | int nested_enable_evmcs(struct kvm_vcpu *vcpu, |
| 332 | uint16_t *vmcs_version) | ||
| 333 | { | ||
| 334 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 335 | |||
| 336 | if (vmcs_version) | ||
| 337 | *vmcs_version = nested_get_evmcs_version(vcpu); | ||
| 338 | |||
| 339 | /* For simplicity, disabling the feature is not supported. */ | ||
| 340 | if (vmx->nested.enlightened_vmcs_enabled) | ||
| 341 | return 0; | ||
| 323 | 342 | ||
| 324 | #endif /* __KVM_X86_VMX_EVMCS_H */ | 343 | vmx->nested.enlightened_vmcs_enabled = true; |
| 344 | |||
| 345 | vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL; | ||
| 346 | vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; | ||
| 347 | vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; | ||
| 348 | vmx->nested.msrs.secondary_ctls_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC; | ||
| 349 | vmx->nested.msrs.vmfunc_controls &= ~EVMCS1_UNSUPPORTED_VMFUNC; | ||
| 350 | |||
| 351 | return 0; | ||
| 352 | } | ||
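The vmcs_version word that nested_enable_evmcs() hands back packs the supported range into 16 bits, as the comment in nested_get_evmcs_version() describes. A small stand-alone decoder of that encoding (not a KVM interface, just the arithmetic):

    #include <stdint.h>
    #include <stdio.h>

    /* Low byte: minimal supported eVMCS version; high byte: maximum.
     * A value of 0 means enlightened VMCS is not enabled. */
    static void print_evmcs_range(uint16_t vmcs_version)
    {
            if (!vmcs_version) {
                    printf("enlightened VMCS not enabled\n");
                    return;
            }
            printf("eVMCS versions %u..%u supported\n",
                   vmcs_version & 0xFF, vmcs_version >> 8);
    }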
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h new file mode 100644 index 000000000000..e0fcef85b332 --- /dev/null +++ b/arch/x86/kvm/vmx/evmcs.h | |||
| @@ -0,0 +1,202 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
| 2 | #ifndef __KVM_X86_VMX_EVMCS_H | ||
| 3 | #define __KVM_X86_VMX_EVMCS_H | ||
| 4 | |||
| 5 | #include <linux/jump_label.h> | ||
| 6 | |||
| 7 | #include <asm/hyperv-tlfs.h> | ||
| 8 | #include <asm/mshyperv.h> | ||
| 9 | #include <asm/vmx.h> | ||
| 10 | |||
| 11 | #include "capabilities.h" | ||
| 12 | #include "vmcs.h" | ||
| 13 | |||
| 14 | struct vmcs_config; | ||
| 15 | |||
| 16 | DECLARE_STATIC_KEY_FALSE(enable_evmcs); | ||
| 17 | |||
| 18 | #define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs)) | ||
| 19 | |||
| 20 | #define KVM_EVMCS_VERSION 1 | ||
| 21 | |||
| 22 | /* | ||
| 23 | * Enlightened VMCSv1 doesn't support these: | ||
| 24 | * | ||
| 25 | * POSTED_INTR_NV = 0x00000002, | ||
| 26 | * GUEST_INTR_STATUS = 0x00000810, | ||
| 27 | * APIC_ACCESS_ADDR = 0x00002014, | ||
| 28 | * POSTED_INTR_DESC_ADDR = 0x00002016, | ||
| 29 | * EOI_EXIT_BITMAP0 = 0x0000201c, | ||
| 30 | * EOI_EXIT_BITMAP1 = 0x0000201e, | ||
| 31 | * EOI_EXIT_BITMAP2 = 0x00002020, | ||
| 32 | * EOI_EXIT_BITMAP3 = 0x00002022, | ||
| 33 | * GUEST_PML_INDEX = 0x00000812, | ||
| 34 | * PML_ADDRESS = 0x0000200e, | ||
| 35 | * VM_FUNCTION_CONTROL = 0x00002018, | ||
| 36 | * EPTP_LIST_ADDRESS = 0x00002024, | ||
| 37 | * VMREAD_BITMAP = 0x00002026, | ||
| 38 | * VMWRITE_BITMAP = 0x00002028, | ||
| 39 | * | ||
| 40 | * TSC_MULTIPLIER = 0x00002032, | ||
| 41 | * PLE_GAP = 0x00004020, | ||
| 42 | * PLE_WINDOW = 0x00004022, | ||
| 43 | * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, | ||
| 44 | * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808, | ||
| 45 | * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04, | ||
| 46 | * | ||
| 47 | * Currently unsupported in KVM: | ||
| 48 | * GUEST_IA32_RTIT_CTL = 0x00002814, | ||
| 49 | */ | ||
| 50 | #define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \ | ||
| 51 | PIN_BASED_VMX_PREEMPTION_TIMER) | ||
| 52 | #define EVMCS1_UNSUPPORTED_2NDEXEC \ | ||
| 53 | (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ | ||
| 54 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \ | ||
| 55 | SECONDARY_EXEC_APIC_REGISTER_VIRT | \ | ||
| 56 | SECONDARY_EXEC_ENABLE_PML | \ | ||
| 57 | SECONDARY_EXEC_ENABLE_VMFUNC | \ | ||
| 58 | SECONDARY_EXEC_SHADOW_VMCS | \ | ||
| 59 | SECONDARY_EXEC_TSC_SCALING | \ | ||
| 60 | SECONDARY_EXEC_PAUSE_LOOP_EXITING) | ||
| 61 | #define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) | ||
| 62 | #define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) | ||
| 63 | #define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING) | ||
| 64 | |||
| 65 | #if IS_ENABLED(CONFIG_HYPERV) | ||
| 66 | |||
| 67 | struct evmcs_field { | ||
| 68 | u16 offset; | ||
| 69 | u16 clean_field; | ||
| 70 | }; | ||
| 71 | |||
| 72 | extern const struct evmcs_field vmcs_field_to_evmcs_1[]; | ||
| 73 | extern const unsigned int nr_evmcs_1_fields; | ||
| 74 | |||
| 75 | #define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) | ||
| 76 | |||
| 77 | static __always_inline int get_evmcs_offset(unsigned long field, | ||
| 78 | u16 *clean_field) | ||
| 79 | { | ||
| 80 | unsigned int index = ROL16(field, 6); | ||
| 81 | const struct evmcs_field *evmcs_field; | ||
| 82 | |||
| 83 | if (unlikely(index >= nr_evmcs_1_fields)) { | ||
| 84 | WARN_ONCE(1, "KVM: accessing unsupported EVMCS field %lx\n", | ||
| 85 | field); | ||
| 86 | return -ENOENT; | ||
| 87 | } | ||
| 88 | |||
| 89 | evmcs_field = &vmcs_field_to_evmcs_1[index]; | ||
| 90 | |||
| 91 | if (clean_field) | ||
| 92 | *clean_field = evmcs_field->clean_field; | ||
| 93 | |||
| 94 | return evmcs_field->offset; | ||
| 95 | } | ||
| 96 | |||
| 97 | #undef ROL16 | ||
| 98 | |||
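get_evmcs_offset() turns a 16-bit VMCS field encoding into a table index by rotating it left by 6 bits, the same transformation used when vmcs_field_to_evmcs_1[] is built with EVMCS1_FIELD(). A quick stand-alone check of that mapping (GUEST_RIP's standard encoding 0x681e is used as the sample field):

    #include <stdint.h>
    #include <stdio.h>

    #define ROL16(val, n) \
            ((uint16_t)(((uint16_t)(val) << (n)) | ((uint16_t)(val) >> (16 - (n)))))

    int main(void)
    {
            uint16_t field = 0x681e;        /* GUEST_RIP field encoding */

            printf("field 0x%04x -> evmcs table index %u\n",
                   (unsigned)field, (unsigned)ROL16(field, 6));
            return 0;
    }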
| 99 | static inline void evmcs_write64(unsigned long field, u64 value) | ||
| 100 | { | ||
| 101 | u16 clean_field; | ||
| 102 | int offset = get_evmcs_offset(field, &clean_field); | ||
| 103 | |||
| 104 | if (offset < 0) | ||
| 105 | return; | ||
| 106 | |||
| 107 | *(u64 *)((char *)current_evmcs + offset) = value; | ||
| 108 | |||
| 109 | current_evmcs->hv_clean_fields &= ~clean_field; | ||
| 110 | } | ||
| 111 | |||
| 112 | static inline void evmcs_write32(unsigned long field, u32 value) | ||
| 113 | { | ||
| 114 | u16 clean_field; | ||
| 115 | int offset = get_evmcs_offset(field, &clean_field); | ||
| 116 | |||
| 117 | if (offset < 0) | ||
| 118 | return; | ||
| 119 | |||
| 120 | *(u32 *)((char *)current_evmcs + offset) = value; | ||
| 121 | current_evmcs->hv_clean_fields &= ~clean_field; | ||
| 122 | } | ||
| 123 | |||
| 124 | static inline void evmcs_write16(unsigned long field, u16 value) | ||
| 125 | { | ||
| 126 | u16 clean_field; | ||
| 127 | int offset = get_evmcs_offset(field, &clean_field); | ||
| 128 | |||
| 129 | if (offset < 0) | ||
| 130 | return; | ||
| 131 | |||
| 132 | *(u16 *)((char *)current_evmcs + offset) = value; | ||
| 133 | current_evmcs->hv_clean_fields &= ~clean_field; | ||
| 134 | } | ||
| 135 | |||
| 136 | static inline u64 evmcs_read64(unsigned long field) | ||
| 137 | { | ||
| 138 | int offset = get_evmcs_offset(field, NULL); | ||
| 139 | |||
| 140 | if (offset < 0) | ||
| 141 | return 0; | ||
| 142 | |||
| 143 | return *(u64 *)((char *)current_evmcs + offset); | ||
| 144 | } | ||
| 145 | |||
| 146 | static inline u32 evmcs_read32(unsigned long field) | ||
| 147 | { | ||
| 148 | int offset = get_evmcs_offset(field, NULL); | ||
| 149 | |||
| 150 | if (offset < 0) | ||
| 151 | return 0; | ||
| 152 | |||
| 153 | return *(u32 *)((char *)current_evmcs + offset); | ||
| 154 | } | ||
| 155 | |||
| 156 | static inline u16 evmcs_read16(unsigned long field) | ||
| 157 | { | ||
| 158 | int offset = get_evmcs_offset(field, NULL); | ||
| 159 | |||
| 160 | if (offset < 0) | ||
| 161 | return 0; | ||
| 162 | |||
| 163 | return *(u16 *)((char *)current_evmcs + offset); | ||
| 164 | } | ||
| 165 | |||
| 166 | static inline void evmcs_touch_msr_bitmap(void) | ||
| 167 | { | ||
| 168 | if (unlikely(!current_evmcs)) | ||
| 169 | return; | ||
| 170 | |||
| 171 | if (current_evmcs->hv_enlightenments_control.msr_bitmap) | ||
| 172 | current_evmcs->hv_clean_fields &= | ||
| 173 | ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; | ||
| 174 | } | ||
| 175 | |||
| 176 | static inline void evmcs_load(u64 phys_addr) | ||
| 177 | { | ||
| 178 | struct hv_vp_assist_page *vp_ap = | ||
| 179 | hv_get_vp_assist_page(smp_processor_id()); | ||
| 180 | |||
| 181 | vp_ap->current_nested_vmcs = phys_addr; | ||
| 182 | vp_ap->enlighten_vmentry = 1; | ||
| 183 | } | ||
| 184 | |||
| 185 | void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); | ||
| 186 | #else /* !IS_ENABLED(CONFIG_HYPERV) */ | ||
| 187 | static inline void evmcs_write64(unsigned long field, u64 value) {} | ||
| 188 | static inline void evmcs_write32(unsigned long field, u32 value) {} | ||
| 189 | static inline void evmcs_write16(unsigned long field, u16 value) {} | ||
| 190 | static inline u64 evmcs_read64(unsigned long field) { return 0; } | ||
| 191 | static inline u32 evmcs_read32(unsigned long field) { return 0; } | ||
| 192 | static inline u16 evmcs_read16(unsigned long field) { return 0; } | ||
| 193 | static inline void evmcs_load(u64 phys_addr) {} | ||
| 194 | static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {} | ||
| 195 | static inline void evmcs_touch_msr_bitmap(void) {} | ||
| 196 | #endif /* IS_ENABLED(CONFIG_HYPERV) */ | ||
| 197 | |||
| 198 | uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu); | ||
| 199 | int nested_enable_evmcs(struct kvm_vcpu *vcpu, | ||
| 200 | uint16_t *vmcs_version); | ||
| 201 | |||
| 202 | #endif /* __KVM_X86_VMX_EVMCS_H */ | ||
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c new file mode 100644 index 000000000000..3170e291215d --- /dev/null +++ b/arch/x86/kvm/vmx/nested.c | |||
| @@ -0,0 +1,5721 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0 | ||
| 2 | |||
| 3 | #include <linux/frame.h> | ||
| 4 | #include <linux/percpu.h> | ||
| 5 | |||
| 6 | #include <asm/debugreg.h> | ||
| 7 | #include <asm/mmu_context.h> | ||
| 8 | |||
| 9 | #include "cpuid.h" | ||
| 10 | #include "hyperv.h" | ||
| 11 | #include "mmu.h" | ||
| 12 | #include "nested.h" | ||
| 13 | #include "trace.h" | ||
| 14 | #include "x86.h" | ||
| 15 | |||
| 16 | static bool __read_mostly enable_shadow_vmcs = true; | ||
| 17 | module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); | ||
| 18 | |||
| 19 | static bool __read_mostly nested_early_check = false; | ||
| 20 | module_param(nested_early_check, bool, S_IRUGO); | ||
| 21 | |||
| 22 | /* | ||
| 23 | * Hyper-V requires all of these, so mark them as supported even though | ||
| 24 | * they are just treated the same as all-context. | ||
| 25 | */ | ||
| 26 | #define VMX_VPID_EXTENT_SUPPORTED_MASK \ | ||
| 27 | (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ | ||
| 28 | VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ | ||
| 29 | VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ | ||
| 30 | VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) | ||
| 31 | |||
| 32 | #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 | ||
| 33 | |||
| 34 | enum { | ||
| 35 | VMX_VMREAD_BITMAP, | ||
| 36 | VMX_VMWRITE_BITMAP, | ||
| 37 | VMX_BITMAP_NR | ||
| 38 | }; | ||
| 39 | static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; | ||
| 40 | |||
| 41 | #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) | ||
| 42 | #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) | ||
| 43 | |||
| 44 | static u16 shadow_read_only_fields[] = { | ||
| 45 | #define SHADOW_FIELD_RO(x) x, | ||
| 46 | #include "vmcs_shadow_fields.h" | ||
| 47 | }; | ||
| 48 | static int max_shadow_read_only_fields = | ||
| 49 | ARRAY_SIZE(shadow_read_only_fields); | ||
| 50 | |||
| 51 | static u16 shadow_read_write_fields[] = { | ||
| 52 | #define SHADOW_FIELD_RW(x) x, | ||
| 53 | #include "vmcs_shadow_fields.h" | ||
| 54 | }; | ||
| 55 | static int max_shadow_read_write_fields = | ||
| 56 | ARRAY_SIZE(shadow_read_write_fields); | ||
| 57 | |||
| 58 | void init_vmcs_shadow_fields(void) | ||
| 59 | { | ||
| 60 | int i, j; | ||
| 61 | |||
| 62 | memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); | ||
| 63 | memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); | ||
| 64 | |||
| 65 | for (i = j = 0; i < max_shadow_read_only_fields; i++) { | ||
| 66 | u16 field = shadow_read_only_fields[i]; | ||
| 67 | |||
| 68 | if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && | ||
| 69 | (i + 1 == max_shadow_read_only_fields || | ||
| 70 | shadow_read_only_fields[i + 1] != field + 1)) | ||
| 71 | pr_err("Missing field from shadow_read_only_field %x\n", | ||
| 72 | field + 1); | ||
| 73 | |||
| 74 | clear_bit(field, vmx_vmread_bitmap); | ||
| 75 | #ifdef CONFIG_X86_64 | ||
| 76 | if (field & 1) | ||
| 77 | continue; | ||
| 78 | #endif | ||
| 79 | if (j < i) | ||
| 80 | shadow_read_only_fields[j] = field; | ||
| 81 | j++; | ||
| 82 | } | ||
| 83 | max_shadow_read_only_fields = j; | ||
| 84 | |||
| 85 | for (i = j = 0; i < max_shadow_read_write_fields; i++) { | ||
| 86 | u16 field = shadow_read_write_fields[i]; | ||
| 87 | |||
| 88 | if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && | ||
| 89 | (i + 1 == max_shadow_read_write_fields || | ||
| 90 | shadow_read_write_fields[i + 1] != field + 1)) | ||
| 91 | pr_err("Missing field from shadow_read_write_field %x\n", | ||
| 92 | field + 1); | ||
| 93 | |||
| 94 | /* | ||
| 95 | * PML and the preemption timer can be emulated, but the | ||
| 96 | * processor cannot vmwrite to fields that don't exist | ||
| 97 | * on bare metal. | ||
| 98 | */ | ||
| 99 | switch (field) { | ||
| 100 | case GUEST_PML_INDEX: | ||
| 101 | if (!cpu_has_vmx_pml()) | ||
| 102 | continue; | ||
| 103 | break; | ||
| 104 | case VMX_PREEMPTION_TIMER_VALUE: | ||
| 105 | if (!cpu_has_vmx_preemption_timer()) | ||
| 106 | continue; | ||
| 107 | break; | ||
| 108 | case GUEST_INTR_STATUS: | ||
| 109 | if (!cpu_has_vmx_apicv()) | ||
| 110 | continue; | ||
| 111 | break; | ||
| 112 | default: | ||
| 113 | break; | ||
| 114 | } | ||
| 115 | |||
| 116 | clear_bit(field, vmx_vmwrite_bitmap); | ||
| 117 | clear_bit(field, vmx_vmread_bitmap); | ||
| 118 | #ifdef CONFIG_X86_64 | ||
| 119 | if (field & 1) | ||
| 120 | continue; | ||
| 121 | #endif | ||
| 122 | if (j < i) | ||
| 123 | shadow_read_write_fields[j] = field; | ||
| 124 | j++; | ||
| 125 | } | ||
| 126 | max_shadow_read_write_fields = j; | ||
| 127 | } | ||
| 128 | |||
| 129 | /* | ||
| 130 | * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), | ||
| 131 | * set the success or error code of an emulated VMX instruction (as specified | ||
| 132 | * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated | ||
| 133 | * instruction. | ||
| 134 | */ | ||
| 135 | static int nested_vmx_succeed(struct kvm_vcpu *vcpu) | ||
| 136 | { | ||
| 137 | vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) | ||
| 138 | & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | | ||
| 139 | X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); | ||
| 140 | return kvm_skip_emulated_instruction(vcpu); | ||
| 141 | } | ||
| 142 | |||
| 143 | static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) | ||
| 144 | { | ||
| 145 | vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) | ||
| 146 | & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | | ||
| 147 | X86_EFLAGS_SF | X86_EFLAGS_OF)) | ||
| 148 | | X86_EFLAGS_CF); | ||
| 149 | return kvm_skip_emulated_instruction(vcpu); | ||
| 150 | } | ||
| 151 | |||
| 152 | static int nested_vmx_failValid(struct kvm_vcpu *vcpu, | ||
| 153 | u32 vm_instruction_error) | ||
| 154 | { | ||
| 155 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 156 | |||
| 157 | /* | ||
| 158 | * failValid writes the error number to the current VMCS, which | ||
| 159 | * can't be done if there isn't a current VMCS. | ||
| 160 | */ | ||
| 161 | if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs) | ||
| 162 | return nested_vmx_failInvalid(vcpu); | ||
| 163 | |||
| 164 | vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) | ||
| 165 | & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | | ||
| 166 | X86_EFLAGS_SF | X86_EFLAGS_OF)) | ||
| 167 | | X86_EFLAGS_ZF); | ||
| 168 | get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; | ||
| 169 | /* | ||
| 170 | * We don't need to force a shadow sync because | ||
| 171 | * VM_INSTRUCTION_ERROR is not shadowed. | ||
| 172 | */ | ||
| 173 | return kvm_skip_emulated_instruction(vcpu); | ||
| 174 | } | ||
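For reference, the three helpers above implement the RFLAGS conventions from the SDM's "Conventions" section: VMsucceed clears CF/PF/AF/ZF/SF/OF, VMfailInvalid sets CF (no current VMCS), and VMfailValid sets ZF and stores an error number in VM_INSTRUCTION_ERROR. A minimal sketch, not kernel code, of how a guest could decode an RFLAGS value under those conventions (the two flag constants are redefined locally so the snippet is self-contained):

    #define EFLAGS_CF 0x0001UL	/* carry flag */
    #define EFLAGS_ZF 0x0040UL	/* zero flag */

    enum vmx_result { VMSUCCEED, VMFAIL_INVALID, VMFAIL_VALID };

    /* Decode the outcome of a VMX instruction from the resulting RFLAGS. */
    static enum vmx_result decode_vmx_result(unsigned long rflags)
    {
            if (rflags & EFLAGS_CF)
                    return VMFAIL_INVALID;	/* no current VMCS */
            if (rflags & EFLAGS_ZF)
                    return VMFAIL_VALID;	/* error code in VM_INSTRUCTION_ERROR */
            return VMSUCCEED;
    }

    int main(void)
    {
            return decode_vmx_result(EFLAGS_ZF) == VMFAIL_VALID ? 0 : 1;
    }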
| 175 | |||
| 176 | static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) | ||
| 177 | { | ||
| 178 | /* TODO: don't simply reset the guest here. */ | ||
| 179 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); | ||
| 180 | pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); | ||
| 181 | } | ||
| 182 | |||
| 183 | static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) | ||
| 184 | { | ||
| 185 | vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); | ||
| 186 | vmcs_write64(VMCS_LINK_POINTER, -1ull); | ||
| 187 | } | ||
| 188 | |||
| 189 | static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) | ||
| 190 | { | ||
| 191 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 192 | |||
| 193 | if (!vmx->nested.hv_evmcs) | ||
| 194 | return; | ||
| 195 | |||
| 196 | kunmap(vmx->nested.hv_evmcs_page); | ||
| 197 | kvm_release_page_dirty(vmx->nested.hv_evmcs_page); | ||
| 198 | vmx->nested.hv_evmcs_vmptr = -1ull; | ||
| 199 | vmx->nested.hv_evmcs_page = NULL; | ||
| 200 | vmx->nested.hv_evmcs = NULL; | ||
| 201 | } | ||
| 202 | |||
| 203 | /* | ||
| 204 | * Free whatever needs to be freed from vmx->nested when L1 goes down, or | ||
| 205 | * just stops using VMX. | ||
| 206 | */ | ||
| 207 | static void free_nested(struct kvm_vcpu *vcpu) | ||
| 208 | { | ||
| 209 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 210 | |||
| 211 | if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) | ||
| 212 | return; | ||
| 213 | |||
| 214 | vmx->nested.vmxon = false; | ||
| 215 | vmx->nested.smm.vmxon = false; | ||
| 216 | free_vpid(vmx->nested.vpid02); | ||
| 217 | vmx->nested.posted_intr_nv = -1; | ||
| 218 | vmx->nested.current_vmptr = -1ull; | ||
| 219 | if (enable_shadow_vmcs) { | ||
| 220 | vmx_disable_shadow_vmcs(vmx); | ||
| 221 | vmcs_clear(vmx->vmcs01.shadow_vmcs); | ||
| 222 | free_vmcs(vmx->vmcs01.shadow_vmcs); | ||
| 223 | vmx->vmcs01.shadow_vmcs = NULL; | ||
| 224 | } | ||
| 225 | kfree(vmx->nested.cached_vmcs12); | ||
| 226 | kfree(vmx->nested.cached_shadow_vmcs12); | ||
| 227 | /* Unpin physical memory we referred to in the vmcs02 */ | ||
| 228 | if (vmx->nested.apic_access_page) { | ||
| 229 | kvm_release_page_dirty(vmx->nested.apic_access_page); | ||
| 230 | vmx->nested.apic_access_page = NULL; | ||
| 231 | } | ||
| 232 | if (vmx->nested.virtual_apic_page) { | ||
| 233 | kvm_release_page_dirty(vmx->nested.virtual_apic_page); | ||
| 234 | vmx->nested.virtual_apic_page = NULL; | ||
| 235 | } | ||
| 236 | if (vmx->nested.pi_desc_page) { | ||
| 237 | kunmap(vmx->nested.pi_desc_page); | ||
| 238 | kvm_release_page_dirty(vmx->nested.pi_desc_page); | ||
| 239 | vmx->nested.pi_desc_page = NULL; | ||
| 240 | vmx->nested.pi_desc = NULL; | ||
| 241 | } | ||
| 242 | |||
| 243 | kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); | ||
| 244 | |||
| 245 | nested_release_evmcs(vcpu); | ||
| 246 | |||
| 247 | free_loaded_vmcs(&vmx->nested.vmcs02); | ||
| 248 | } | ||
| 249 | |||
| 250 | static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) | ||
| 251 | { | ||
| 252 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 253 | int cpu; | ||
| 254 | |||
| 255 | if (vmx->loaded_vmcs == vmcs) | ||
| 256 | return; | ||
| 257 | |||
| 258 | cpu = get_cpu(); | ||
| 259 | vmx_vcpu_put(vcpu); | ||
| 260 | vmx->loaded_vmcs = vmcs; | ||
| 261 | vmx_vcpu_load(vcpu, cpu); | ||
| 262 | put_cpu(); | ||
| 263 | |||
| 264 | vm_entry_controls_reset_shadow(vmx); | ||
| 265 | vm_exit_controls_reset_shadow(vmx); | ||
| 266 | vmx_segment_cache_clear(vmx); | ||
| 267 | } | ||
| 268 | |||
| 269 | /* | ||
| 270 | * Ensure that the current vmcs of the logical processor is the | ||
| 271 | * vmcs01 of the vcpu before calling free_nested(). | ||
| 272 | */ | ||
| 273 | void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) | ||
| 274 | { | ||
| 275 | vcpu_load(vcpu); | ||
| 276 | vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01); | ||
| 277 | free_nested(vcpu); | ||
| 278 | vcpu_put(vcpu); | ||
| 279 | } | ||
| 280 | |||
| 281 | static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, | ||
| 282 | struct x86_exception *fault) | ||
| 283 | { | ||
| 284 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 285 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 286 | u32 exit_reason; | ||
| 287 | unsigned long exit_qualification = vcpu->arch.exit_qualification; | ||
| 288 | |||
| 289 | if (vmx->nested.pml_full) { | ||
| 290 | exit_reason = EXIT_REASON_PML_FULL; | ||
| 291 | vmx->nested.pml_full = false; | ||
| 292 | exit_qualification &= INTR_INFO_UNBLOCK_NMI; | ||
| 293 | } else if (fault->error_code & PFERR_RSVD_MASK) | ||
| 294 | exit_reason = EXIT_REASON_EPT_MISCONFIG; | ||
| 295 | else | ||
| 296 | exit_reason = EXIT_REASON_EPT_VIOLATION; | ||
| 297 | |||
| 298 | nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification); | ||
| 299 | vmcs12->guest_physical_address = fault->address; | ||
| 300 | } | ||
| 301 | |||
| 302 | static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) | ||
| 303 | { | ||
| 304 | WARN_ON(mmu_is_nested(vcpu)); | ||
| 305 | |||
| 306 | vcpu->arch.mmu = &vcpu->arch.guest_mmu; | ||
| 307 | kvm_init_shadow_ept_mmu(vcpu, | ||
| 308 | to_vmx(vcpu)->nested.msrs.ept_caps & | ||
| 309 | VMX_EPT_EXECUTE_ONLY_BIT, | ||
| 310 | nested_ept_ad_enabled(vcpu), | ||
| 311 | nested_ept_get_cr3(vcpu)); | ||
| 312 | vcpu->arch.mmu->set_cr3 = vmx_set_cr3; | ||
| 313 | vcpu->arch.mmu->get_cr3 = nested_ept_get_cr3; | ||
| 314 | vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; | ||
| 315 | vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; | ||
| 316 | |||
| 317 | vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; | ||
| 318 | } | ||
| 319 | |||
| 320 | static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) | ||
| 321 | { | ||
| 322 | vcpu->arch.mmu = &vcpu->arch.root_mmu; | ||
| 323 | vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; | ||
| 324 | } | ||
| 325 | |||
| 326 | static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, | ||
| 327 | u16 error_code) | ||
| 328 | { | ||
| 329 | bool inequality, bit; | ||
| 330 | |||
| 331 | bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; | ||
| 332 | inequality = | ||
| 333 | (error_code & vmcs12->page_fault_error_code_mask) != | ||
| 334 | vmcs12->page_fault_error_code_match; | ||
| 335 | return inequality ^ bit; | ||
| 336 | } | ||
| 337 | |||
| 338 | |||
| 339 | /* | ||
| 340 | * KVM wants to inject page faults that it received into the guest. This | ||
| 341 | * function checks whether, in a nested guest, we need to inject them into L1 or L2. | ||
| 342 | */ | ||
| 343 | static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual) | ||
| 344 | { | ||
| 345 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 346 | unsigned int nr = vcpu->arch.exception.nr; | ||
| 347 | bool has_payload = vcpu->arch.exception.has_payload; | ||
| 348 | unsigned long payload = vcpu->arch.exception.payload; | ||
| 349 | |||
| 350 | if (nr == PF_VECTOR) { | ||
| 351 | if (vcpu->arch.exception.nested_apf) { | ||
| 352 | *exit_qual = vcpu->arch.apf.nested_apf_token; | ||
| 353 | return 1; | ||
| 354 | } | ||
| 355 | if (nested_vmx_is_page_fault_vmexit(vmcs12, | ||
| 356 | vcpu->arch.exception.error_code)) { | ||
| 357 | *exit_qual = has_payload ? payload : vcpu->arch.cr2; | ||
| 358 | return 1; | ||
| 359 | } | ||
| 360 | } else if (vmcs12->exception_bitmap & (1u << nr)) { | ||
| 361 | if (nr == DB_VECTOR) { | ||
| 362 | if (!has_payload) { | ||
| 363 | payload = vcpu->arch.dr6; | ||
| 364 | payload &= ~(DR6_FIXED_1 | DR6_BT); | ||
| 365 | payload ^= DR6_RTM; | ||
| 366 | } | ||
| 367 | *exit_qual = payload; | ||
| 368 | } else | ||
| 369 | *exit_qual = 0; | ||
| 370 | return 1; | ||
| 371 | } | ||
| 372 | |||
| 373 | return 0; | ||
| 374 | } | ||
| 375 | |||
| 376 | |||
| 377 | static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, | ||
| 378 | struct x86_exception *fault) | ||
| 379 | { | ||
| 380 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 381 | |||
| 382 | WARN_ON(!is_guest_mode(vcpu)); | ||
| 383 | |||
| 384 | if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) && | ||
| 385 | !to_vmx(vcpu)->nested.nested_run_pending) { | ||
| 386 | vmcs12->vm_exit_intr_error_code = fault->error_code; | ||
| 387 | nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, | ||
| 388 | PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | | ||
| 389 | INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK, | ||
| 390 | fault->address); | ||
| 391 | } else { | ||
| 392 | kvm_inject_page_fault(vcpu, fault); | ||
| 393 | } | ||
| 394 | } | ||
| 395 | |||
| 396 | static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) | ||
| 397 | { | ||
| 398 | return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); | ||
| 399 | } | ||
| 400 | |||
| 401 | static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, | ||
| 402 | struct vmcs12 *vmcs12) | ||
| 403 | { | ||
| 404 | if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) | ||
| 405 | return 0; | ||
| 406 | |||
| 407 | if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) || | ||
| 408 | !page_address_valid(vcpu, vmcs12->io_bitmap_b)) | ||
| 409 | return -EINVAL; | ||
| 410 | |||
| 411 | return 0; | ||
| 412 | } | ||
| 413 | |||
| 414 | static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, | ||
| 415 | struct vmcs12 *vmcs12) | ||
| 416 | { | ||
| 417 | if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) | ||
| 418 | return 0; | ||
| 419 | |||
| 420 | if (!page_address_valid(vcpu, vmcs12->msr_bitmap)) | ||
| 421 | return -EINVAL; | ||
| 422 | |||
| 423 | return 0; | ||
| 424 | } | ||
| 425 | |||
| 426 | static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, | ||
| 427 | struct vmcs12 *vmcs12) | ||
| 428 | { | ||
| 429 | if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) | ||
| 430 | return 0; | ||
| 431 | |||
| 432 | if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)) | ||
| 433 | return -EINVAL; | ||
| 434 | |||
| 435 | return 0; | ||
| 436 | } | ||
| 437 | |||
| 438 | /* | ||
| 439 | * Check whether a write to the given MSR is intercepted by the L01 MSR bitmap. | ||
| 440 | */ | ||
| 441 | static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr) | ||
| 442 | { | ||
| 443 | unsigned long *msr_bitmap; | ||
| 444 | int f = sizeof(unsigned long); | ||
| 445 | |||
| 446 | if (!cpu_has_vmx_msr_bitmap()) | ||
| 447 | return true; | ||
| 448 | |||
| 449 | msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; | ||
| 450 | |||
| 451 | if (msr <= 0x1fff) { | ||
| 452 | return !!test_bit(msr, msr_bitmap + 0x800 / f); | ||
| 453 | } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { | ||
| 454 | msr &= 0x1fff; | ||
| 455 | return !!test_bit(msr, msr_bitmap + 0xc00 / f); | ||
| 456 | } | ||
| 457 | |||
| 458 | return true; | ||
| 459 | } | ||
| 460 | |||
| 461 | /* | ||
| 462 | * If an MSR is allowed by L0, we should check whether it is also allowed by L1. | ||
| 463 | * The corresponding bit is cleared only if both L0 and L1 allow it. | ||
| 464 | */ | ||
| 465 | static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, | ||
| 466 | unsigned long *msr_bitmap_nested, | ||
| 467 | u32 msr, int type) | ||
| 468 | { | ||
| 469 | int f = sizeof(unsigned long); | ||
| 470 | |||
| 471 | /* | ||
| 472 | * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals | ||
| 473 | * have the write-low and read-high bitmap offsets the wrong way round. | ||
| 474 | * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. | ||
| 475 | */ | ||
| 476 | if (msr <= 0x1fff) { | ||
| 477 | if (type & MSR_TYPE_R && | ||
| 478 | !test_bit(msr, msr_bitmap_l1 + 0x000 / f)) | ||
| 479 | /* read-low */ | ||
| 480 | __clear_bit(msr, msr_bitmap_nested + 0x000 / f); | ||
| 481 | |||
| 482 | if (type & MSR_TYPE_W && | ||
| 483 | !test_bit(msr, msr_bitmap_l1 + 0x800 / f)) | ||
| 484 | /* write-low */ | ||
| 485 | __clear_bit(msr, msr_bitmap_nested + 0x800 / f); | ||
| 486 | |||
| 487 | } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { | ||
| 488 | msr &= 0x1fff; | ||
| 489 | if (type & MSR_TYPE_R && | ||
| 490 | !test_bit(msr, msr_bitmap_l1 + 0x400 / f)) | ||
| 491 | /* read-high */ | ||
| 492 | __clear_bit(msr, msr_bitmap_nested + 0x400 / f); | ||
| 493 | |||
| 494 | if (type & MSR_TYPE_W && | ||
| 495 | !test_bit(msr, msr_bitmap_l1 + 0xc00 / f)) | ||
| 496 | /* write-high */ | ||
| 497 | __clear_bit(msr, msr_bitmap_nested + 0xc00 / f); | ||
| 498 | |||
| 499 | } | ||
| 500 | } | ||
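To make the 0x000/0x400/0x800/0xc00 offsets above concrete: the 4 KiB MSR bitmap is split into four 1 KiB regions (read-low, read-high, write-low, write-high), each holding one intercept bit per MSR for 8192 MSRs. The sketch below is illustrative only, uses a made-up helper name, and computes the byte and bit an access maps to under that layout:

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Return the byte offset into the 4 KiB MSR bitmap for a (msr, write)
     * access and store the bit position in *bit, or return -1 if the MSR
     * is outside the two controllable ranges (such accesses always exit).
     */
    static int msr_bitmap_byte(uint32_t msr, int write, unsigned int *bit)
    {
            unsigned int base;

            if (msr <= 0x1fff) {
                    base = write ? 0x800 : 0x000;	/* write-low / read-low */
            } else if (msr >= 0xc0000000 && msr <= 0xc0001fff) {
                    base = write ? 0xc00 : 0x400;	/* write-high / read-high */
                    msr &= 0x1fff;
            } else {
                    return -1;
            }

            *bit = msr % 8;
            return base + msr / 8;
    }

    int main(void)
    {
            unsigned int bit;
            int byte = msr_bitmap_byte(0xc0000080 /* EFER */, 1, &bit);

            printf("EFER write intercept: byte 0x%x, bit %u\n", byte, bit);
            return 0;
    }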
| 501 | |||
| 502 | /* | ||
| 503 | * Merge L0's and L1's MSR bitmaps; return false to indicate that | ||
| 504 | * the hardware MSR bitmap should not be used. | ||
| 505 | */ | ||
| 506 | static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, | ||
| 507 | struct vmcs12 *vmcs12) | ||
| 508 | { | ||
| 509 | int msr; | ||
| 510 | struct page *page; | ||
| 511 | unsigned long *msr_bitmap_l1; | ||
| 512 | unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; | ||
| 513 | /* | ||
| 514 | * pred_cmd & spec_ctrl are trying to verify two things: | ||
| 515 | * | ||
| 516 | * 1. L0 gave permission to L1 to actually pass the MSR through. This | ||
| 517 | * ensures that we do not accidentally generate an L02 MSR bitmap | ||
| 518 | * from the L12 MSR bitmap that is too permissive. | ||
| 519 | * 2. L1 or its L2s have actually used the MSR. This avoids | ||
| 520 | * unnecessary merging of the bitmap if the MSR is unused. This | ||
| 521 | * works properly because we only update the L01 MSR bitmap lazily. | ||
| 522 | * So even if L0 should pass these MSRs through to L1, the L01 bitmap | ||
| 523 | * is only updated to reflect this when L1 (or its L2s) actually | ||
| 524 | * write to the MSR. | ||
| 525 | */ | ||
| 526 | bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD); | ||
| 527 | bool spec_ctrl = !msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL); | ||
| 528 | |||
| 529 | /* Nothing to do if the MSR bitmap is not in use. */ | ||
| 530 | if (!cpu_has_vmx_msr_bitmap() || | ||
| 531 | !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) | ||
| 532 | return false; | ||
| 533 | |||
| 534 | if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && | ||
| 535 | !pred_cmd && !spec_ctrl) | ||
| 536 | return false; | ||
| 537 | |||
| 538 | page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap); | ||
| 539 | if (is_error_page(page)) | ||
| 540 | return false; | ||
| 541 | |||
| 542 | msr_bitmap_l1 = (unsigned long *)kmap(page); | ||
| 543 | if (nested_cpu_has_apic_reg_virt(vmcs12)) { | ||
| 544 | /* | ||
| 545 | * L0 need not intercept reads for MSRs between 0x800 and 0x8ff; it | ||
| 546 | * just lets the processor take the value from the virtual-APIC page, | ||
| 547 | * so take those 256 bits directly from the L1 bitmap. | ||
| 548 | */ | ||
| 549 | for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { | ||
| 550 | unsigned word = msr / BITS_PER_LONG; | ||
| 551 | msr_bitmap_l0[word] = msr_bitmap_l1[word]; | ||
| 552 | msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0; | ||
| 553 | } | ||
| 554 | } else { | ||
| 555 | for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { | ||
| 556 | unsigned word = msr / BITS_PER_LONG; | ||
| 557 | msr_bitmap_l0[word] = ~0; | ||
| 558 | msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0; | ||
| 559 | } | ||
| 560 | } | ||
| 561 | |||
| 562 | nested_vmx_disable_intercept_for_msr( | ||
| 563 | msr_bitmap_l1, msr_bitmap_l0, | ||
| 564 | X2APIC_MSR(APIC_TASKPRI), | ||
| 565 | MSR_TYPE_W); | ||
| 566 | |||
| 567 | if (nested_cpu_has_vid(vmcs12)) { | ||
| 568 | nested_vmx_disable_intercept_for_msr( | ||
| 569 | msr_bitmap_l1, msr_bitmap_l0, | ||
| 570 | X2APIC_MSR(APIC_EOI), | ||
| 571 | MSR_TYPE_W); | ||
| 572 | nested_vmx_disable_intercept_for_msr( | ||
| 573 | msr_bitmap_l1, msr_bitmap_l0, | ||
| 574 | X2APIC_MSR(APIC_SELF_IPI), | ||
| 575 | MSR_TYPE_W); | ||
| 576 | } | ||
| 577 | |||
| 578 | if (spec_ctrl) | ||
| 579 | nested_vmx_disable_intercept_for_msr( | ||
| 580 | msr_bitmap_l1, msr_bitmap_l0, | ||
| 581 | MSR_IA32_SPEC_CTRL, | ||
| 582 | MSR_TYPE_R | MSR_TYPE_W); | ||
| 583 | |||
| 584 | if (pred_cmd) | ||
| 585 | nested_vmx_disable_intercept_for_msr( | ||
| 586 | msr_bitmap_l1, msr_bitmap_l0, | ||
| 587 | MSR_IA32_PRED_CMD, | ||
| 588 | MSR_TYPE_W); | ||
| 589 | |||
| 590 | kunmap(page); | ||
| 591 | kvm_release_page_clean(page); | ||
| 592 | |||
| 593 | return true; | ||
| 594 | } | ||
| 595 | |||
| 596 | static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, | ||
| 597 | struct vmcs12 *vmcs12) | ||
| 598 | { | ||
| 599 | struct vmcs12 *shadow; | ||
| 600 | struct page *page; | ||
| 601 | |||
| 602 | if (!nested_cpu_has_shadow_vmcs(vmcs12) || | ||
| 603 | vmcs12->vmcs_link_pointer == -1ull) | ||
| 604 | return; | ||
| 605 | |||
| 606 | shadow = get_shadow_vmcs12(vcpu); | ||
| 607 | page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); | ||
| 608 | |||
| 609 | memcpy(shadow, kmap(page), VMCS12_SIZE); | ||
| 610 | |||
| 611 | kunmap(page); | ||
| 612 | kvm_release_page_clean(page); | ||
| 613 | } | ||
| 614 | |||
| 615 | static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, | ||
| 616 | struct vmcs12 *vmcs12) | ||
| 617 | { | ||
| 618 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 619 | |||
| 620 | if (!nested_cpu_has_shadow_vmcs(vmcs12) || | ||
| 621 | vmcs12->vmcs_link_pointer == -1ull) | ||
| 622 | return; | ||
| 623 | |||
| 624 | kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer, | ||
| 625 | get_shadow_vmcs12(vcpu), VMCS12_SIZE); | ||
| 626 | } | ||
| 627 | |||
| 628 | /* | ||
| 629 | * In nested virtualization, check if L1 has set | ||
| 630 | * VM_EXIT_ACK_INTR_ON_EXIT. | ||
| 631 | */ | ||
| 632 | static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) | ||
| 633 | { | ||
| 634 | return get_vmcs12(vcpu)->vm_exit_controls & | ||
| 635 | VM_EXIT_ACK_INTR_ON_EXIT; | ||
| 636 | } | ||
| 637 | |||
| 638 | static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) | ||
| 639 | { | ||
| 640 | return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu)); | ||
| 641 | } | ||
| 642 | |||
| 643 | static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, | ||
| 644 | struct vmcs12 *vmcs12) | ||
| 645 | { | ||
| 646 | if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && | ||
| 647 | !page_address_valid(vcpu, vmcs12->apic_access_addr)) | ||
| 648 | return -EINVAL; | ||
| 649 | else | ||
| 650 | return 0; | ||
| 651 | } | ||
| 652 | |||
| 653 | static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, | ||
| 654 | struct vmcs12 *vmcs12) | ||
| 655 | { | ||
| 656 | if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && | ||
| 657 | !nested_cpu_has_apic_reg_virt(vmcs12) && | ||
| 658 | !nested_cpu_has_vid(vmcs12) && | ||
| 659 | !nested_cpu_has_posted_intr(vmcs12)) | ||
| 660 | return 0; | ||
| 661 | |||
| 662 | /* | ||
| 663 | * If virtualize x2apic mode is enabled, | ||
| 664 | * virtualize apic access must be disabled. | ||
| 665 | */ | ||
| 666 | if (nested_cpu_has_virt_x2apic_mode(vmcs12) && | ||
| 667 | nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) | ||
| 668 | return -EINVAL; | ||
| 669 | |||
| 670 | /* | ||
| 671 | * If virtual interrupt delivery is enabled, | ||
| 672 | * we must exit on external interrupts. | ||
| 673 | */ | ||
| 674 | if (nested_cpu_has_vid(vmcs12) && | ||
| 675 | !nested_exit_on_intr(vcpu)) | ||
| 676 | return -EINVAL; | ||
| 677 | |||
| 678 | /* | ||
| 679 | * Bits 15:8 should be zero in posted_intr_nv; | ||
| 680 | * the descriptor address has already been checked | ||
| 681 | * in nested_get_vmcs12_pages. | ||
| 682 | * | ||
| 683 | * Bits 5:0 of posted_intr_desc_addr should be zero. | ||
| 684 | */ | ||
| 685 | if (nested_cpu_has_posted_intr(vmcs12) && | ||
| 686 | (!nested_cpu_has_vid(vmcs12) || | ||
| 687 | !nested_exit_intr_ack_set(vcpu) || | ||
| 688 | (vmcs12->posted_intr_nv & 0xff00) || | ||
| 689 | (vmcs12->posted_intr_desc_addr & 0x3f) || | ||
| 690 | (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))) | ||
| 691 | return -EINVAL; | ||
| 692 | |||
| 693 | /* TPR shadow is required by all APICv features. */ | ||
| 694 | if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) | ||
| 695 | return -EINVAL; | ||
| 696 | |||
| 697 | return 0; | ||
| 698 | } | ||
| 699 | |||
| 700 | static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, | ||
| 701 | u32 count, u64 addr) | ||
| 702 | { | ||
| 703 | int maxphyaddr; | ||
| 704 | |||
| 705 | if (count == 0) | ||
| 706 | return 0; | ||
| 707 | maxphyaddr = cpuid_maxphyaddr(vcpu); | ||
| 708 | if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || | ||
| 709 | (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) | ||
| 710 | return -EINVAL; | ||
| 711 | |||
| 712 | return 0; | ||
| 713 | } | ||
| 714 | |||
| 715 | static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, | ||
| 716 | struct vmcs12 *vmcs12) | ||
| 717 | { | ||
| 718 | if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_load_count, | ||
| 719 | vmcs12->vm_exit_msr_load_addr) || | ||
| 720 | nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_store_count, | ||
| 721 | vmcs12->vm_exit_msr_store_addr)) | ||
| 722 | return -EINVAL; | ||
| 723 | |||
| 724 | return 0; | ||
| 725 | } | ||
| 726 | |||
| 727 | static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu, | ||
| 728 | struct vmcs12 *vmcs12) | ||
| 729 | { | ||
| 730 | if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_entry_msr_load_count, | ||
| 731 | vmcs12->vm_entry_msr_load_addr)) | ||
| 732 | return -EINVAL; | ||
| 733 | |||
| 734 | return 0; | ||
| 735 | } | ||
| 736 | |||
| 737 | static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, | ||
| 738 | struct vmcs12 *vmcs12) | ||
| 739 | { | ||
| 740 | if (!nested_cpu_has_pml(vmcs12)) | ||
| 741 | return 0; | ||
| 742 | |||
| 743 | if (!nested_cpu_has_ept(vmcs12) || | ||
| 744 | !page_address_valid(vcpu, vmcs12->pml_address)) | ||
| 745 | return -EINVAL; | ||
| 746 | |||
| 747 | return 0; | ||
| 748 | } | ||
| 749 | |||
| 750 | static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, | ||
| 751 | struct vmcs12 *vmcs12) | ||
| 752 | { | ||
| 753 | if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && | ||
| 754 | !nested_cpu_has_ept(vmcs12)) | ||
| 755 | return -EINVAL; | ||
| 756 | return 0; | ||
| 757 | } | ||
| 758 | |||
| 759 | static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, | ||
| 760 | struct vmcs12 *vmcs12) | ||
| 761 | { | ||
| 762 | if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && | ||
| 763 | !nested_cpu_has_ept(vmcs12)) | ||
| 764 | return -EINVAL; | ||
| 765 | return 0; | ||
| 766 | } | ||
| 767 | |||
| 768 | static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, | ||
| 769 | struct vmcs12 *vmcs12) | ||
| 770 | { | ||
| 771 | if (!nested_cpu_has_shadow_vmcs(vmcs12)) | ||
| 772 | return 0; | ||
| 773 | |||
| 774 | if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) || | ||
| 775 | !page_address_valid(vcpu, vmcs12->vmwrite_bitmap)) | ||
| 776 | return -EINVAL; | ||
| 777 | |||
| 778 | return 0; | ||
| 779 | } | ||
| 780 | |||
| 781 | static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, | ||
| 782 | struct vmx_msr_entry *e) | ||
| 783 | { | ||
| 784 | /* x2APIC MSR accesses are not allowed */ | ||
| 785 | if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8) | ||
| 786 | return -EINVAL; | ||
| 787 | if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */ | ||
| 788 | e->index == MSR_IA32_UCODE_REV) | ||
| 789 | return -EINVAL; | ||
| 790 | if (e->reserved != 0) | ||
| 791 | return -EINVAL; | ||
| 792 | return 0; | ||
| 793 | } | ||
| 794 | |||
| 795 | static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, | ||
| 796 | struct vmx_msr_entry *e) | ||
| 797 | { | ||
| 798 | if (e->index == MSR_FS_BASE || | ||
| 799 | e->index == MSR_GS_BASE || | ||
| 800 | e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */ | ||
| 801 | nested_vmx_msr_check_common(vcpu, e)) | ||
| 802 | return -EINVAL; | ||
| 803 | return 0; | ||
| 804 | } | ||
| 805 | |||
| 806 | static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, | ||
| 807 | struct vmx_msr_entry *e) | ||
| 808 | { | ||
| 809 | if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */ | ||
| 810 | nested_vmx_msr_check_common(vcpu, e)) | ||
| 811 | return -EINVAL; | ||
| 812 | return 0; | ||
| 813 | } | ||
| 814 | |||
| 815 | /* | ||
| 816 | * Load the guest's/host's MSRs at nested entry/exit. | ||
| 817 | * Return 0 on success, or the 1-based index of the failing entry on failure. | ||
| 818 | */ | ||
| 819 | static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) | ||
| 820 | { | ||
| 821 | u32 i; | ||
| 822 | struct vmx_msr_entry e; | ||
| 823 | struct msr_data msr; | ||
| 824 | |||
| 825 | msr.host_initiated = false; | ||
| 826 | for (i = 0; i < count; i++) { | ||
| 827 | if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), | ||
| 828 | &e, sizeof(e))) { | ||
| 829 | pr_debug_ratelimited( | ||
| 830 | "%s cannot read MSR entry (%u, 0x%08llx)\n", | ||
| 831 | __func__, i, gpa + i * sizeof(e)); | ||
| 832 | goto fail; | ||
| 833 | } | ||
| 834 | if (nested_vmx_load_msr_check(vcpu, &e)) { | ||
| 835 | pr_debug_ratelimited( | ||
| 836 | "%s check failed (%u, 0x%x, 0x%x)\n", | ||
| 837 | __func__, i, e.index, e.reserved); | ||
| 838 | goto fail; | ||
| 839 | } | ||
| 840 | msr.index = e.index; | ||
| 841 | msr.data = e.value; | ||
| 842 | if (kvm_set_msr(vcpu, &msr)) { | ||
| 843 | pr_debug_ratelimited( | ||
| 844 | "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", | ||
| 845 | __func__, i, e.index, e.value); | ||
| 846 | goto fail; | ||
| 847 | } | ||
| 848 | } | ||
| 849 | return 0; | ||
| 850 | fail: | ||
| 851 | return i + 1; | ||
| 852 | } | ||
| 853 | |||
| 854 | static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) | ||
| 855 | { | ||
| 856 | u32 i; | ||
| 857 | struct vmx_msr_entry e; | ||
| 858 | |||
| 859 | for (i = 0; i < count; i++) { | ||
| 860 | struct msr_data msr_info; | ||
| 861 | if (kvm_vcpu_read_guest(vcpu, | ||
| 862 | gpa + i * sizeof(e), | ||
| 863 | &e, 2 * sizeof(u32))) { | ||
| 864 | pr_debug_ratelimited( | ||
| 865 | "%s cannot read MSR entry (%u, 0x%08llx)\n", | ||
| 866 | __func__, i, gpa + i * sizeof(e)); | ||
| 867 | return -EINVAL; | ||
| 868 | } | ||
| 869 | if (nested_vmx_store_msr_check(vcpu, &e)) { | ||
| 870 | pr_debug_ratelimited( | ||
| 871 | "%s check failed (%u, 0x%x, 0x%x)\n", | ||
| 872 | __func__, i, e.index, e.reserved); | ||
| 873 | return -EINVAL; | ||
| 874 | } | ||
| 875 | msr_info.host_initiated = false; | ||
| 876 | msr_info.index = e.index; | ||
| 877 | if (kvm_get_msr(vcpu, &msr_info)) { | ||
| 878 | pr_debug_ratelimited( | ||
| 879 | "%s cannot read MSR (%u, 0x%x)\n", | ||
| 880 | __func__, i, e.index); | ||
| 881 | return -EINVAL; | ||
| 882 | } | ||
| 883 | if (kvm_vcpu_write_guest(vcpu, | ||
| 884 | gpa + i * sizeof(e) + | ||
| 885 | offsetof(struct vmx_msr_entry, value), | ||
| 886 | &msr_info.data, sizeof(msr_info.data))) { | ||
| 887 | pr_debug_ratelimited( | ||
| 888 | "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", | ||
| 889 | __func__, i, e.index, msr_info.data); | ||
| 890 | return -EINVAL; | ||
| 891 | } | ||
| 892 | } | ||
| 893 | return 0; | ||
| 894 | } | ||
| 895 | |||
| 896 | static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val) | ||
| 897 | { | ||
| 898 | unsigned long invalid_mask; | ||
| 899 | |||
| 900 | invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu); | ||
| 901 | return (val & invalid_mask) == 0; | ||
| 902 | } | ||
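As a concrete example of the mask above: with cpuid_maxphyaddr() returning 48, invalid_mask is ~0ULL << 48 = 0xffff000000000000, so any CR3 value with one of bits 63:48 set fails the check. A tiny standalone sketch of the same computation, with the MAXPHYADDR value passed in explicitly:

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative only: mirrors nested_cr3_valid() for a caller-supplied width. */
    static int cr3_fits_maxphyaddr(uint64_t cr3, unsigned int maxphyaddr)
    {
            uint64_t invalid_mask = ~0ULL << maxphyaddr;

            return (cr3 & invalid_mask) == 0;
    }

    int main(void)
    {
            printf("%d\n", cr3_fits_maxphyaddr(0x00000000ffff0000ULL, 48)); /* 1 */
            printf("%d\n", cr3_fits_maxphyaddr(0x8000000000001000ULL, 48)); /* 0 */
            return 0;
    }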
| 903 | |||
| 904 | /* | ||
| 905 | * Load the guest's/host's CR3 at nested entry/exit. nested_ept is true if we | ||
| 906 | * are emulating VM entry into a guest with EPT enabled. | ||
| 907 | * Returns 0 on success, 1 on failure. On failure, the invalid-state exit | ||
| 908 | * qualification code is assigned to *entry_failure_code. | ||
| 909 | */ | ||
| 910 | static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, | ||
| 911 | u32 *entry_failure_code) | ||
| 912 | { | ||
| 913 | if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) { | ||
| 914 | if (!nested_cr3_valid(vcpu, cr3)) { | ||
| 915 | *entry_failure_code = ENTRY_FAIL_DEFAULT; | ||
| 916 | return 1; | ||
| 917 | } | ||
| 918 | |||
| 919 | /* | ||
| 920 | * If PAE paging and EPT are both on, CR3 is not used by the CPU and | ||
| 921 | * must not be dereferenced. | ||
| 922 | */ | ||
| 923 | if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) && | ||
| 924 | !nested_ept) { | ||
| 925 | if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) { | ||
| 926 | *entry_failure_code = ENTRY_FAIL_PDPTE; | ||
| 927 | return 1; | ||
| 928 | } | ||
| 929 | } | ||
| 930 | } | ||
| 931 | |||
| 932 | if (!nested_ept) | ||
| 933 | kvm_mmu_new_cr3(vcpu, cr3, false); | ||
| 934 | |||
| 935 | vcpu->arch.cr3 = cr3; | ||
| 936 | __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); | ||
| 937 | |||
| 938 | kvm_init_mmu(vcpu, false); | ||
| 939 | |||
| 940 | return 0; | ||
| 941 | } | ||
| 942 | |||
| 943 | /* | ||
| 944 | * Returns whether KVM is able to configure the CPU to tag TLB entries | ||
| 945 | * populated by L2 differently from TLB entries populated | ||
| 946 | * by L1. | ||
| 947 | * | ||
| 948 | * If L1 uses EPT, then the TLB entries are tagged with a different EPTP. | ||
| 949 | * | ||
| 950 | * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged | ||
| 951 | * with different VPIDs (L1 entries are tagged with vmx->vpid | ||
| 952 | * while L2 entries are tagged with vmx->nested.vpid02). | ||
| 953 | */ | ||
| 954 | static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) | ||
| 955 | { | ||
| 956 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 957 | |||
| 958 | return nested_cpu_has_ept(vmcs12) || | ||
| 959 | (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); | ||
| 960 | } | ||
| 961 | |||
| 962 | static u16 nested_get_vpid02(struct kvm_vcpu *vcpu) | ||
| 963 | { | ||
| 964 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 965 | |||
| 966 | return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid; | ||
| 967 | } | ||
| 968 | |||
| 969 | |||
| 970 | static inline bool vmx_control_verify(u32 control, u32 low, u32 high) | ||
| 971 | { | ||
| 972 | return fixed_bits_valid(control, low, high); | ||
| 973 | } | ||
| 974 | |||
| 975 | static inline u64 vmx_control_msr(u32 low, u32 high) | ||
| 976 | { | ||
| 977 | return low | ((u64)high << 32); | ||
| 978 | } | ||
| 979 | |||
| 980 | static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) | ||
| 981 | { | ||
| 982 | superset &= mask; | ||
| 983 | subset &= mask; | ||
| 984 | |||
| 985 | return (superset | subset) == superset; | ||
| 986 | } | ||
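The subset check above underpins all of the MSR-restore validation that follows: within mask, every bit set in subset must also be set in superset, because OR-ing a true subset into the superset cannot change it. A few self-contained illustrative cases:

    #include <assert.h>
    #include <stdint.h>

    /* Same logic as is_bitwise_subset() above, on fixed-width types. */
    static int bitwise_subset(uint64_t superset, uint64_t subset, uint64_t mask)
    {
            superset &= mask;
            subset &= mask;

            return (superset | subset) == superset;
    }

    int main(void)
    {
            assert(bitwise_subset(0xc, 0x4, ~0ULL));	/* 0b0100 is within 0b1100 */
            assert(!bitwise_subset(0xc, 0x2, ~0ULL));	/* 0b0010 is not */
            assert(bitwise_subset(0xc, 0x2, 0xc));		/* bits outside mask are ignored */
            return 0;
    }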
| 987 | |||
| 988 | static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) | ||
| 989 | { | ||
| 990 | const u64 feature_and_reserved = | ||
| 991 | /* feature (except bit 48; see below) */ | ||
| 992 | BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | | ||
| 993 | /* reserved */ | ||
| 994 | BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); | ||
| 995 | u64 vmx_basic = vmx->nested.msrs.basic; | ||
| 996 | |||
| 997 | if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) | ||
| 998 | return -EINVAL; | ||
| 999 | |||
| 1000 | /* | ||
| 1001 | * KVM does not emulate a version of VMX that constrains physical | ||
| 1002 | * addresses of VMX structures (e.g. VMCS) to 32 bits. | ||
| 1003 | */ | ||
| 1004 | if (data & BIT_ULL(48)) | ||
| 1005 | return -EINVAL; | ||
| 1006 | |||
| 1007 | if (vmx_basic_vmcs_revision_id(vmx_basic) != | ||
| 1008 | vmx_basic_vmcs_revision_id(data)) | ||
| 1009 | return -EINVAL; | ||
| 1010 | |||
| 1011 | if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) | ||
| 1012 | return -EINVAL; | ||
| 1013 | |||
| 1014 | vmx->nested.msrs.basic = data; | ||
| 1015 | return 0; | ||
| 1016 | } | ||
| 1017 | |||
| 1018 | static int | ||
| 1019 | vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) | ||
| 1020 | { | ||
| 1021 | u64 supported; | ||
| 1022 | u32 *lowp, *highp; | ||
| 1023 | |||
| 1024 | switch (msr_index) { | ||
| 1025 | case MSR_IA32_VMX_TRUE_PINBASED_CTLS: | ||
| 1026 | lowp = &vmx->nested.msrs.pinbased_ctls_low; | ||
| 1027 | highp = &vmx->nested.msrs.pinbased_ctls_high; | ||
| 1028 | break; | ||
| 1029 | case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: | ||
| 1030 | lowp = &vmx->nested.msrs.procbased_ctls_low; | ||
| 1031 | highp = &vmx->nested.msrs.procbased_ctls_high; | ||
| 1032 | break; | ||
| 1033 | case MSR_IA32_VMX_TRUE_EXIT_CTLS: | ||
| 1034 | lowp = &vmx->nested.msrs.exit_ctls_low; | ||
| 1035 | highp = &vmx->nested.msrs.exit_ctls_high; | ||
| 1036 | break; | ||
| 1037 | case MSR_IA32_VMX_TRUE_ENTRY_CTLS: | ||
| 1038 | lowp = &vmx->nested.msrs.entry_ctls_low; | ||
| 1039 | highp = &vmx->nested.msrs.entry_ctls_high; | ||
| 1040 | break; | ||
| 1041 | case MSR_IA32_VMX_PROCBASED_CTLS2: | ||
| 1042 | lowp = &vmx->nested.msrs.secondary_ctls_low; | ||
| 1043 | highp = &vmx->nested.msrs.secondary_ctls_high; | ||
| 1044 | break; | ||
| 1045 | default: | ||
| 1046 | BUG(); | ||
| 1047 | } | ||
| 1048 | |||
| 1049 | supported = vmx_control_msr(*lowp, *highp); | ||
| 1050 | |||
| 1051 | /* Check must-be-1 bits are still 1. */ | ||
| 1052 | if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) | ||
| 1053 | return -EINVAL; | ||
| 1054 | |||
| 1055 | /* Check must-be-0 bits are still 0. */ | ||
| 1056 | if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) | ||
| 1057 | return -EINVAL; | ||
| 1058 | |||
| 1059 | *lowp = data; | ||
| 1060 | *highp = data >> 32; | ||
| 1061 | return 0; | ||
| 1062 | } | ||
| 1063 | |||
| 1064 | static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) | ||
| 1065 | { | ||
| 1066 | const u64 feature_and_reserved_bits = | ||
| 1067 | /* feature */ | ||
| 1068 | BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | | ||
| 1069 | BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | | ||
| 1070 | /* reserved */ | ||
| 1071 | GENMASK_ULL(13, 9) | BIT_ULL(31); | ||
| 1072 | u64 vmx_misc; | ||
| 1073 | |||
| 1074 | vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, | ||
| 1075 | vmx->nested.msrs.misc_high); | ||
| 1076 | |||
| 1077 | if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) | ||
| 1078 | return -EINVAL; | ||
| 1079 | |||
| 1080 | if ((vmx->nested.msrs.pinbased_ctls_high & | ||
| 1081 | PIN_BASED_VMX_PREEMPTION_TIMER) && | ||
| 1082 | vmx_misc_preemption_timer_rate(data) != | ||
| 1083 | vmx_misc_preemption_timer_rate(vmx_misc)) | ||
| 1084 | return -EINVAL; | ||
| 1085 | |||
| 1086 | if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) | ||
| 1087 | return -EINVAL; | ||
| 1088 | |||
| 1089 | if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) | ||
| 1090 | return -EINVAL; | ||
| 1091 | |||
| 1092 | if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) | ||
| 1093 | return -EINVAL; | ||
| 1094 | |||
| 1095 | vmx->nested.msrs.misc_low = data; | ||
| 1096 | vmx->nested.msrs.misc_high = data >> 32; | ||
| 1097 | |||
| 1098 | /* | ||
| 1099 | * If L1 has read-only VM-exit information fields, use the | ||
| 1100 | * less permissive vmx_vmwrite_bitmap to specify write | ||
| 1101 | * permissions for the shadow VMCS. | ||
| 1102 | */ | ||
| 1103 | if (enable_shadow_vmcs && !nested_cpu_has_vmwrite_any_field(&vmx->vcpu)) | ||
| 1104 | vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); | ||
| 1105 | |||
| 1106 | return 0; | ||
| 1107 | } | ||
| 1108 | |||
| 1109 | static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) | ||
| 1110 | { | ||
| 1111 | u64 vmx_ept_vpid_cap; | ||
| 1112 | |||
| 1113 | vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps, | ||
| 1114 | vmx->nested.msrs.vpid_caps); | ||
| 1115 | |||
| 1116 | /* Every bit is either reserved or a feature bit. */ | ||
| 1117 | if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) | ||
| 1118 | return -EINVAL; | ||
| 1119 | |||
| 1120 | vmx->nested.msrs.ept_caps = data; | ||
| 1121 | vmx->nested.msrs.vpid_caps = data >> 32; | ||
| 1122 | return 0; | ||
| 1123 | } | ||
| 1124 | |||
| 1125 | static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) | ||
| 1126 | { | ||
| 1127 | u64 *msr; | ||
| 1128 | |||
| 1129 | switch (msr_index) { | ||
| 1130 | case MSR_IA32_VMX_CR0_FIXED0: | ||
| 1131 | msr = &vmx->nested.msrs.cr0_fixed0; | ||
| 1132 | break; | ||
| 1133 | case MSR_IA32_VMX_CR4_FIXED0: | ||
| 1134 | msr = &vmx->nested.msrs.cr4_fixed0; | ||
| 1135 | break; | ||
| 1136 | default: | ||
| 1137 | BUG(); | ||
| 1138 | } | ||
| 1139 | |||
| 1140 | /* | ||
| 1141 | * 1 bits (which indicate bits that "must be 1" during VMX operation) | ||
| 1142 | * must be 1 in the restored value. | ||
| 1143 | */ | ||
| 1144 | if (!is_bitwise_subset(data, *msr, -1ULL)) | ||
| 1145 | return -EINVAL; | ||
| 1146 | |||
| 1147 | *msr = data; | ||
| 1148 | return 0; | ||
| 1149 | } | ||
| 1150 | |||
| 1151 | /* | ||
| 1152 | * Called when userspace is restoring VMX MSRs. | ||
| 1153 | * | ||
| 1154 | * Returns 0 on success, non-0 otherwise. | ||
| 1155 | */ | ||
| 1156 | int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | ||
| 1157 | { | ||
| 1158 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 1159 | |||
| 1160 | /* | ||
| 1161 | * Don't allow changes to the VMX capability MSRs while the vCPU | ||
| 1162 | * is in VMX operation. | ||
| 1163 | */ | ||
| 1164 | if (vmx->nested.vmxon) | ||
| 1165 | return -EBUSY; | ||
| 1166 | |||
| 1167 | switch (msr_index) { | ||
| 1168 | case MSR_IA32_VMX_BASIC: | ||
| 1169 | return vmx_restore_vmx_basic(vmx, data); | ||
| 1170 | case MSR_IA32_VMX_PINBASED_CTLS: | ||
| 1171 | case MSR_IA32_VMX_PROCBASED_CTLS: | ||
| 1172 | case MSR_IA32_VMX_EXIT_CTLS: | ||
| 1173 | case MSR_IA32_VMX_ENTRY_CTLS: | ||
| 1174 | /* | ||
| 1175 | * The "non-true" VMX capability MSRs are generated from the | ||
| 1176 | * "true" MSRs, so we do not support restoring them directly. | ||
| 1177 | * | ||
| 1178 | * If userspace wants to emulate VMX_BASIC[55]=0, userspace | ||
| 1179 | * should restore the "true" MSRs with the must-be-1 bits | ||
| 1180 | * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND | ||
| 1181 | * DEFAULT SETTINGS". | ||
| 1182 | */ | ||
| 1183 | return -EINVAL; | ||
| 1184 | case MSR_IA32_VMX_TRUE_PINBASED_CTLS: | ||
| 1185 | case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: | ||
| 1186 | case MSR_IA32_VMX_TRUE_EXIT_CTLS: | ||
| 1187 | case MSR_IA32_VMX_TRUE_ENTRY_CTLS: | ||
| 1188 | case MSR_IA32_VMX_PROCBASED_CTLS2: | ||
| 1189 | return vmx_restore_control_msr(vmx, msr_index, data); | ||
| 1190 | case MSR_IA32_VMX_MISC: | ||
| 1191 | return vmx_restore_vmx_misc(vmx, data); | ||
| 1192 | case MSR_IA32_VMX_CR0_FIXED0: | ||
| 1193 | case MSR_IA32_VMX_CR4_FIXED0: | ||
| 1194 | return vmx_restore_fixed0_msr(vmx, msr_index, data); | ||
| 1195 | case MSR_IA32_VMX_CR0_FIXED1: | ||
| 1196 | case MSR_IA32_VMX_CR4_FIXED1: | ||
| 1197 | /* | ||
| 1198 | * These MSRs are generated based on the vCPU's CPUID, so we | ||
| 1199 | * do not support restoring them directly. | ||
| 1200 | */ | ||
| 1201 | return -EINVAL; | ||
| 1202 | case MSR_IA32_VMX_EPT_VPID_CAP: | ||
| 1203 | return vmx_restore_vmx_ept_vpid_cap(vmx, data); | ||
| 1204 | case MSR_IA32_VMX_VMCS_ENUM: | ||
| 1205 | vmx->nested.msrs.vmcs_enum = data; | ||
| 1206 | return 0; | ||
| 1207 | default: | ||
| 1208 | /* | ||
| 1209 | * The rest of the VMX capability MSRs do not support restore. | ||
| 1210 | */ | ||
| 1211 | return -EINVAL; | ||
| 1212 | } | ||
| 1213 | } | ||
| 1214 | |||
| 1215 | /* Returns 0 on success, non-0 otherwise. */ | ||
| 1216 | int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) | ||
| 1217 | { | ||
| 1218 | switch (msr_index) { | ||
| 1219 | case MSR_IA32_VMX_BASIC: | ||
| 1220 | *pdata = msrs->basic; | ||
| 1221 | break; | ||
| 1222 | case MSR_IA32_VMX_TRUE_PINBASED_CTLS: | ||
| 1223 | case MSR_IA32_VMX_PINBASED_CTLS: | ||
| 1224 | *pdata = vmx_control_msr( | ||
| 1225 | msrs->pinbased_ctls_low, | ||
| 1226 | msrs->pinbased_ctls_high); | ||
| 1227 | if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) | ||
| 1228 | *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; | ||
| 1229 | break; | ||
| 1230 | case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: | ||
| 1231 | case MSR_IA32_VMX_PROCBASED_CTLS: | ||
| 1232 | *pdata = vmx_control_msr( | ||
| 1233 | msrs->procbased_ctls_low, | ||
| 1234 | msrs->procbased_ctls_high); | ||
| 1235 | if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) | ||
| 1236 | *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; | ||
| 1237 | break; | ||
| 1238 | case MSR_IA32_VMX_TRUE_EXIT_CTLS: | ||
| 1239 | case MSR_IA32_VMX_EXIT_CTLS: | ||
| 1240 | *pdata = vmx_control_msr( | ||
| 1241 | msrs->exit_ctls_low, | ||
| 1242 | msrs->exit_ctls_high); | ||
| 1243 | if (msr_index == MSR_IA32_VMX_EXIT_CTLS) | ||
| 1244 | *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; | ||
| 1245 | break; | ||
| 1246 | case MSR_IA32_VMX_TRUE_ENTRY_CTLS: | ||
| 1247 | case MSR_IA32_VMX_ENTRY_CTLS: | ||
| 1248 | *pdata = vmx_control_msr( | ||
| 1249 | msrs->entry_ctls_low, | ||
| 1250 | msrs->entry_ctls_high); | ||
| 1251 | if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) | ||
| 1252 | *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; | ||
| 1253 | break; | ||
| 1254 | case MSR_IA32_VMX_MISC: | ||
| 1255 | *pdata = vmx_control_msr( | ||
| 1256 | msrs->misc_low, | ||
| 1257 | msrs->misc_high); | ||
| 1258 | break; | ||
| 1259 | case MSR_IA32_VMX_CR0_FIXED0: | ||
| 1260 | *pdata = msrs->cr0_fixed0; | ||
| 1261 | break; | ||
| 1262 | case MSR_IA32_VMX_CR0_FIXED1: | ||
| 1263 | *pdata = msrs->cr0_fixed1; | ||
| 1264 | break; | ||
| 1265 | case MSR_IA32_VMX_CR4_FIXED0: | ||
| 1266 | *pdata = msrs->cr4_fixed0; | ||
| 1267 | break; | ||
| 1268 | case MSR_IA32_VMX_CR4_FIXED1: | ||
| 1269 | *pdata = msrs->cr4_fixed1; | ||
| 1270 | break; | ||
| 1271 | case MSR_IA32_VMX_VMCS_ENUM: | ||
| 1272 | *pdata = msrs->vmcs_enum; | ||
| 1273 | break; | ||
| 1274 | case MSR_IA32_VMX_PROCBASED_CTLS2: | ||
| 1275 | *pdata = vmx_control_msr( | ||
| 1276 | msrs->secondary_ctls_low, | ||
| 1277 | msrs->secondary_ctls_high); | ||
| 1278 | break; | ||
| 1279 | case MSR_IA32_VMX_EPT_VPID_CAP: | ||
| 1280 | *pdata = msrs->ept_caps | | ||
| 1281 | ((u64)msrs->vpid_caps << 32); | ||
| 1282 | break; | ||
| 1283 | case MSR_IA32_VMX_VMFUNC: | ||
| 1284 | *pdata = msrs->vmfunc_controls; | ||
| 1285 | break; | ||
| 1286 | default: | ||
| 1287 | return 1; | ||
| 1288 | } | ||
| 1289 | |||
| 1290 | return 0; | ||
| 1291 | } | ||
| 1292 | |||
| 1293 | /* | ||
| 1294 | * Copy the writable VMCS shadow fields back to the VMCS12, in case | ||
| 1295 | * they have been modified by the L1 guest. Note that the "read-only" | ||
| 1296 | * VM-exit information fields are actually writable if the vCPU is | ||
| 1297 | * configured to support "VMWRITE to any supported field in the VMCS." | ||
| 1298 | */ | ||
| 1299 | static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) | ||
| 1300 | { | ||
| 1301 | const u16 *fields[] = { | ||
| 1302 | shadow_read_write_fields, | ||
| 1303 | shadow_read_only_fields | ||
| 1304 | }; | ||
| 1305 | const int max_fields[] = { | ||
| 1306 | max_shadow_read_write_fields, | ||
| 1307 | max_shadow_read_only_fields | ||
| 1308 | }; | ||
| 1309 | int i, q; | ||
| 1310 | unsigned long field; | ||
| 1311 | u64 field_value; | ||
| 1312 | struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; | ||
| 1313 | |||
| 1314 | preempt_disable(); | ||
| 1315 | |||
| 1316 | vmcs_load(shadow_vmcs); | ||
| 1317 | |||
| 1318 | for (q = 0; q < ARRAY_SIZE(fields); q++) { | ||
| 1319 | for (i = 0; i < max_fields[q]; i++) { | ||
| 1320 | field = fields[q][i]; | ||
| 1321 | field_value = __vmcs_readl(field); | ||
| 1322 | vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value); | ||
| 1323 | } | ||
| 1324 | /* | ||
| 1325 | * Skip the VM-exit information fields if they are read-only. | ||
| 1326 | */ | ||
| 1327 | if (!nested_cpu_has_vmwrite_any_field(&vmx->vcpu)) | ||
| 1328 | break; | ||
| 1329 | } | ||
| 1330 | |||
| 1331 | vmcs_clear(shadow_vmcs); | ||
| 1332 | vmcs_load(vmx->loaded_vmcs->vmcs); | ||
| 1333 | |||
| 1334 | preempt_enable(); | ||
| 1335 | } | ||
| 1336 | |||
| 1337 | static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) | ||
| 1338 | { | ||
| 1339 | const u16 *fields[] = { | ||
| 1340 | shadow_read_write_fields, | ||
| 1341 | shadow_read_only_fields | ||
| 1342 | }; | ||
| 1343 | const int max_fields[] = { | ||
| 1344 | max_shadow_read_write_fields, | ||
| 1345 | max_shadow_read_only_fields | ||
| 1346 | }; | ||
| 1347 | int i, q; | ||
| 1348 | unsigned long field; | ||
| 1349 | u64 field_value = 0; | ||
| 1350 | struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; | ||
| 1351 | |||
| 1352 | vmcs_load(shadow_vmcs); | ||
| 1353 | |||
| 1354 | for (q = 0; q < ARRAY_SIZE(fields); q++) { | ||
| 1355 | for (i = 0; i < max_fields[q]; i++) { | ||
| 1356 | field = fields[q][i]; | ||
| 1357 | vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value); | ||
| 1358 | __vmcs_writel(field, field_value); | ||
| 1359 | } | ||
| 1360 | } | ||
| 1361 | |||
| 1362 | vmcs_clear(shadow_vmcs); | ||
| 1363 | vmcs_load(vmx->loaded_vmcs->vmcs); | ||
| 1364 | } | ||
| 1365 | |||
| 1366 | static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) | ||
| 1367 | { | ||
| 1368 | struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; | ||
| 1369 | struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; | ||
| 1370 | |||
| 1371 | /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ | ||
| 1372 | vmcs12->tpr_threshold = evmcs->tpr_threshold; | ||
| 1373 | vmcs12->guest_rip = evmcs->guest_rip; | ||
| 1374 | |||
| 1375 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 1376 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { | ||
| 1377 | vmcs12->guest_rsp = evmcs->guest_rsp; | ||
| 1378 | vmcs12->guest_rflags = evmcs->guest_rflags; | ||
| 1379 | vmcs12->guest_interruptibility_info = | ||
| 1380 | evmcs->guest_interruptibility_info; | ||
| 1381 | } | ||
| 1382 | |||
| 1383 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 1384 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { | ||
| 1385 | vmcs12->cpu_based_vm_exec_control = | ||
| 1386 | evmcs->cpu_based_vm_exec_control; | ||
| 1387 | } | ||
| 1388 | |||
| 1389 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 1390 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { | ||
| 1391 | vmcs12->exception_bitmap = evmcs->exception_bitmap; | ||
| 1392 | } | ||
| 1393 | |||
| 1394 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 1395 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { | ||
| 1396 | vmcs12->vm_entry_controls = evmcs->vm_entry_controls; | ||
| 1397 | } | ||
| 1398 | |||
| 1399 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 1400 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { | ||
| 1401 | vmcs12->vm_entry_intr_info_field = | ||
| 1402 | evmcs->vm_entry_intr_info_field; | ||
| 1403 | vmcs12->vm_entry_exception_error_code = | ||
| 1404 | evmcs->vm_entry_exception_error_code; | ||
| 1405 | vmcs12->vm_entry_instruction_len = | ||
| 1406 | evmcs->vm_entry_instruction_len; | ||
| 1407 | } | ||
| 1408 | |||
| 1409 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 1410 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { | ||
| 1411 | vmcs12->host_ia32_pat = evmcs->host_ia32_pat; | ||
| 1412 | vmcs12->host_ia32_efer = evmcs->host_ia32_efer; | ||
| 1413 | vmcs12->host_cr0 = evmcs->host_cr0; | ||
| 1414 | vmcs12->host_cr3 = evmcs->host_cr3; | ||
| 1415 | vmcs12->host_cr4 = evmcs->host_cr4; | ||
| 1416 | vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; | ||
| 1417 | vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip; | ||
| 1418 | vmcs12->host_rip = evmcs->host_rip; | ||
| 1419 | vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; | ||
| 1420 | vmcs12->host_es_selector = evmcs->host_es_selector; | ||
| 1421 | vmcs12->host_cs_selector = evmcs->host_cs_selector; | ||
| 1422 | vmcs12->host_ss_selector = evmcs->host_ss_selector; | ||
| 1423 | vmcs12->host_ds_selector = evmcs->host_ds_selector; | ||
| 1424 | vmcs12->host_fs_selector = evmcs->host_fs_selector; | ||
| 1425 | vmcs12->host_gs_selector = evmcs->host_gs_selector; | ||
| 1426 | vmcs12->host_tr_selector = evmcs->host_tr_selector; | ||
| 1427 | } | ||
| 1428 | |||
| 1429 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 1430 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) { | ||
| 1431 | vmcs12->pin_based_vm_exec_control = | ||
| 1432 | evmcs->pin_based_vm_exec_control; | ||
| 1433 | vmcs12->vm_exit_controls = evmcs->vm_exit_controls; | ||
| 1434 | vmcs12->secondary_vm_exec_control = | ||
| 1435 | evmcs->secondary_vm_exec_control; | ||
| 1436 | } | ||
| 1437 | |||
| 1438 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 1439 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { | ||
| 1440 | vmcs12->io_bitmap_a = evmcs->io_bitmap_a; | ||
| 1441 | vmcs12->io_bitmap_b = evmcs->io_bitmap_b; | ||
| 1442 | } | ||
| 1443 | |||
| 1444 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 1445 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { | ||
| 1446 | vmcs12->msr_bitmap = evmcs->msr_bitmap; | ||
| 1447 | } | ||
| 1448 | |||
| 1449 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 1450 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { | ||
| 1451 | vmcs12->guest_es_base = evmcs->guest_es_base; | ||
| 1452 | vmcs12->guest_cs_base = evmcs->guest_cs_base; | ||
| 1453 | vmcs12->guest_ss_base = evmcs->guest_ss_base; | ||
| 1454 | vmcs12->guest_ds_base = evmcs->guest_ds_base; | ||
| 1455 | vmcs12->guest_fs_base = evmcs->guest_fs_base; | ||
| 1456 | vmcs12->guest_gs_base = evmcs->guest_gs_base; | ||
| 1457 | vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; | ||
| 1458 | vmcs12->guest_tr_base = evmcs->guest_tr_base; | ||
| 1459 | vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; | ||
| 1460 | vmcs12->guest_idtr_base = evmcs->guest_idtr_base; | ||
| 1461 | vmcs12->guest_es_limit = evmcs->guest_es_limit; | ||
| 1462 | vmcs12->guest_cs_limit = evmcs->guest_cs_limit; | ||
| 1463 | vmcs12->guest_ss_limit = evmcs->guest_ss_limit; | ||
| 1464 | vmcs12->guest_ds_limit = evmcs->guest_ds_limit; | ||
| 1465 | vmcs12->guest_fs_limit = evmcs->guest_fs_limit; | ||
| 1466 | vmcs12->guest_gs_limit = evmcs->guest_gs_limit; | ||
| 1467 | vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit; | ||
| 1468 | vmcs12->guest_tr_limit = evmcs->guest_tr_limit; | ||
| 1469 | vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; | ||
| 1470 | vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit; | ||
| 1471 | vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; | ||
| 1472 | vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; | ||
| 1473 | vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; | ||
| 1474 | vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; | ||
| 1475 | vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; | ||
| 1476 | vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; | ||
| 1477 | vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; | ||
| 1478 | vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; | ||
| 1479 | vmcs12->guest_es_selector = evmcs->guest_es_selector; | ||
| 1480 | vmcs12->guest_cs_selector = evmcs->guest_cs_selector; | ||
| 1481 | vmcs12->guest_ss_selector = evmcs->guest_ss_selector; | ||
| 1482 | vmcs12->guest_ds_selector = evmcs->guest_ds_selector; | ||
| 1483 | vmcs12->guest_fs_selector = evmcs->guest_fs_selector; | ||
| 1484 | vmcs12->guest_gs_selector = evmcs->guest_gs_selector; | ||
| 1485 | vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; | ||
| 1486 | vmcs12->guest_tr_selector = evmcs->guest_tr_selector; | ||
| 1487 | } | ||
| 1488 | |||
| 1489 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 1490 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { | ||
| 1491 | vmcs12->tsc_offset = evmcs->tsc_offset; | ||
| 1492 | vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; | ||
| 1493 | vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; | ||
| 1494 | } | ||
| 1495 | |||
| 1496 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 1497 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { | ||
| 1498 | vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; | ||
| 1499 | vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; | ||
| 1500 | vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; | ||
| 1501 | vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; | ||
| 1502 | vmcs12->guest_cr0 = evmcs->guest_cr0; | ||
| 1503 | vmcs12->guest_cr3 = evmcs->guest_cr3; | ||
| 1504 | vmcs12->guest_cr4 = evmcs->guest_cr4; | ||
| 1505 | vmcs12->guest_dr7 = evmcs->guest_dr7; | ||
| 1506 | } | ||
| 1507 | |||
| 1508 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 1509 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { | ||
| 1510 | vmcs12->host_fs_base = evmcs->host_fs_base; | ||
| 1511 | vmcs12->host_gs_base = evmcs->host_gs_base; | ||
| 1512 | vmcs12->host_tr_base = evmcs->host_tr_base; | ||
| 1513 | vmcs12->host_gdtr_base = evmcs->host_gdtr_base; | ||
| 1514 | vmcs12->host_idtr_base = evmcs->host_idtr_base; | ||
| 1515 | vmcs12->host_rsp = evmcs->host_rsp; | ||
| 1516 | } | ||
| 1517 | |||
| 1518 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 1519 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { | ||
| 1520 | vmcs12->ept_pointer = evmcs->ept_pointer; | ||
| 1521 | vmcs12->virtual_processor_id = evmcs->virtual_processor_id; | ||
| 1522 | } | ||
| 1523 | |||
| 1524 | if (unlikely(!(evmcs->hv_clean_fields & | ||
| 1525 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { | ||
| 1526 | vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; | ||
| 1527 | vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; | ||
| 1528 | vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; | ||
| 1529 | vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; | ||
| 1530 | vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; | ||
| 1531 | vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; | ||
| 1532 | vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; | ||
| 1533 | vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; | ||
| 1534 | vmcs12->guest_pending_dbg_exceptions = | ||
| 1535 | evmcs->guest_pending_dbg_exceptions; | ||
| 1536 | vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; | ||
| 1537 | vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; | ||
| 1538 | vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; | ||
| 1539 | vmcs12->guest_activity_state = evmcs->guest_activity_state; | ||
| 1540 | vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; | ||
| 1541 | } | ||
| 1542 | |||
| 1543 | /* | ||
| 1544 | * Not used? | ||
| 1545 | * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; | ||
| 1546 | * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; | ||
| 1547 | * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; | ||
| 1548 | * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0; | ||
| 1549 | * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1; | ||
| 1550 | * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2; | ||
| 1551 | * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3; | ||
| 1552 | * vmcs12->page_fault_error_code_mask = | ||
| 1553 | * evmcs->page_fault_error_code_mask; | ||
| 1554 | * vmcs12->page_fault_error_code_match = | ||
| 1555 | * evmcs->page_fault_error_code_match; | ||
| 1556 | * vmcs12->cr3_target_count = evmcs->cr3_target_count; | ||
| 1557 | * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; | ||
| 1558 | * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; | ||
| 1559 | * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; | ||
| 1560 | */ | ||
| 1561 | |||
| 1562 | /* | ||
| 1563 | * Read only fields: | ||
| 1564 | * vmcs12->guest_physical_address = evmcs->guest_physical_address; | ||
| 1565 | * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; | ||
| 1566 | * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; | ||
| 1567 | * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; | ||
| 1568 | * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; | ||
| 1569 | * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; | ||
| 1570 | * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; | ||
| 1571 | * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; | ||
| 1572 | * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; | ||
| 1573 | * vmcs12->exit_qualification = evmcs->exit_qualification; | ||
| 1574 | * vmcs12->guest_linear_address = evmcs->guest_linear_address; | ||
| 1575 | * | ||
| 1576 | * Not present in struct vmcs12: | ||
| 1577 | * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; | ||
| 1578 | * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; | ||
| 1579 | * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; | ||
| 1580 | * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; | ||
| 1581 | */ | ||
| 1582 | |||
| 1583 | return 0; | ||
| 1584 | } | ||
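/*
 * Every copy above is guarded by the same predicate: a field group is
 * reloaded from the eVMCS only when its bit in hv_clean_fields is clear,
 * i.e. the guest has marked the group dirty.  A minimal sketch of that
 * predicate as a stand-alone helper (hypothetical, not part of this file):
 */
static inline bool evmcs_group_dirty(const struct hv_enlightened_vmcs *evmcs,
				     u32 clean_field)
{
	/* A clear clean-field bit means the guest may have touched the group. */
	return !(evmcs->hv_clean_fields & clean_field);
}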
| 1585 | |||
| 1586 | static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) | ||
| 1587 | { | ||
| 1588 | struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; | ||
| 1589 | struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; | ||
| 1590 | |||
| 1591 | /* | ||
| 1592 | * Should not be changed by KVM: | ||
| 1593 | * | ||
| 1594 | * evmcs->host_es_selector = vmcs12->host_es_selector; | ||
| 1595 | * evmcs->host_cs_selector = vmcs12->host_cs_selector; | ||
| 1596 | * evmcs->host_ss_selector = vmcs12->host_ss_selector; | ||
| 1597 | * evmcs->host_ds_selector = vmcs12->host_ds_selector; | ||
| 1598 | * evmcs->host_fs_selector = vmcs12->host_fs_selector; | ||
| 1599 | * evmcs->host_gs_selector = vmcs12->host_gs_selector; | ||
| 1600 | * evmcs->host_tr_selector = vmcs12->host_tr_selector; | ||
| 1601 | * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; | ||
| 1602 | * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; | ||
| 1603 | * evmcs->host_cr0 = vmcs12->host_cr0; | ||
| 1604 | * evmcs->host_cr3 = vmcs12->host_cr3; | ||
| 1605 | * evmcs->host_cr4 = vmcs12->host_cr4; | ||
| 1606 | * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; | ||
| 1607 | * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; | ||
| 1608 | * evmcs->host_rip = vmcs12->host_rip; | ||
| 1609 | * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; | ||
| 1610 | * evmcs->host_fs_base = vmcs12->host_fs_base; | ||
| 1611 | * evmcs->host_gs_base = vmcs12->host_gs_base; | ||
| 1612 | * evmcs->host_tr_base = vmcs12->host_tr_base; | ||
| 1613 | * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; | ||
| 1614 | * evmcs->host_idtr_base = vmcs12->host_idtr_base; | ||
| 1615 | * evmcs->host_rsp = vmcs12->host_rsp; | ||
| 1616 | * sync_vmcs12() doesn't read these: | ||
| 1617 | * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; | ||
| 1618 | * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; | ||
| 1619 | * evmcs->msr_bitmap = vmcs12->msr_bitmap; | ||
| 1620 | * evmcs->ept_pointer = vmcs12->ept_pointer; | ||
| 1621 | * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; | ||
| 1622 | * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; | ||
| 1623 | * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; | ||
| 1624 | * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; | ||
| 1625 | * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0; | ||
| 1626 | * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1; | ||
| 1627 | * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2; | ||
| 1628 | * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3; | ||
| 1629 | * evmcs->tpr_threshold = vmcs12->tpr_threshold; | ||
| 1630 | * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; | ||
| 1631 | * evmcs->exception_bitmap = vmcs12->exception_bitmap; | ||
| 1632 | * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; | ||
| 1633 | * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; | ||
| 1634 | * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; | ||
| 1635 | * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; | ||
| 1636 | * evmcs->page_fault_error_code_mask = | ||
| 1637 | * vmcs12->page_fault_error_code_mask; | ||
| 1638 | * evmcs->page_fault_error_code_match = | ||
| 1639 | * vmcs12->page_fault_error_code_match; | ||
| 1640 | * evmcs->cr3_target_count = vmcs12->cr3_target_count; | ||
| 1641 | * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; | ||
| 1642 | * evmcs->tsc_offset = vmcs12->tsc_offset; | ||
| 1643 | * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; | ||
| 1644 | * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; | ||
| 1645 | * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; | ||
| 1646 | * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; | ||
| 1647 | * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; | ||
| 1648 | * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; | ||
| 1649 | * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; | ||
| 1650 | * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; | ||
| 1651 | * | ||
| 1652 | * Not present in struct vmcs12: | ||
| 1653 | * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; | ||
| 1654 | * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; | ||
| 1655 | * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; | ||
| 1656 | * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; | ||
| 1657 | */ | ||
| 1658 | |||
| 1659 | evmcs->guest_es_selector = vmcs12->guest_es_selector; | ||
| 1660 | evmcs->guest_cs_selector = vmcs12->guest_cs_selector; | ||
| 1661 | evmcs->guest_ss_selector = vmcs12->guest_ss_selector; | ||
| 1662 | evmcs->guest_ds_selector = vmcs12->guest_ds_selector; | ||
| 1663 | evmcs->guest_fs_selector = vmcs12->guest_fs_selector; | ||
| 1664 | evmcs->guest_gs_selector = vmcs12->guest_gs_selector; | ||
| 1665 | evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; | ||
| 1666 | evmcs->guest_tr_selector = vmcs12->guest_tr_selector; | ||
| 1667 | |||
| 1668 | evmcs->guest_es_limit = vmcs12->guest_es_limit; | ||
| 1669 | evmcs->guest_cs_limit = vmcs12->guest_cs_limit; | ||
| 1670 | evmcs->guest_ss_limit = vmcs12->guest_ss_limit; | ||
| 1671 | evmcs->guest_ds_limit = vmcs12->guest_ds_limit; | ||
| 1672 | evmcs->guest_fs_limit = vmcs12->guest_fs_limit; | ||
| 1673 | evmcs->guest_gs_limit = vmcs12->guest_gs_limit; | ||
| 1674 | evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; | ||
| 1675 | evmcs->guest_tr_limit = vmcs12->guest_tr_limit; | ||
| 1676 | evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; | ||
| 1677 | evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; | ||
| 1678 | |||
| 1679 | evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; | ||
| 1680 | evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; | ||
| 1681 | evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; | ||
| 1682 | evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; | ||
| 1683 | evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; | ||
| 1684 | evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; | ||
| 1685 | evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; | ||
| 1686 | evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; | ||
| 1687 | |||
| 1688 | evmcs->guest_es_base = vmcs12->guest_es_base; | ||
| 1689 | evmcs->guest_cs_base = vmcs12->guest_cs_base; | ||
| 1690 | evmcs->guest_ss_base = vmcs12->guest_ss_base; | ||
| 1691 | evmcs->guest_ds_base = vmcs12->guest_ds_base; | ||
| 1692 | evmcs->guest_fs_base = vmcs12->guest_fs_base; | ||
| 1693 | evmcs->guest_gs_base = vmcs12->guest_gs_base; | ||
| 1694 | evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; | ||
| 1695 | evmcs->guest_tr_base = vmcs12->guest_tr_base; | ||
| 1696 | evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; | ||
| 1697 | evmcs->guest_idtr_base = vmcs12->guest_idtr_base; | ||
| 1698 | |||
| 1699 | evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; | ||
| 1700 | evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; | ||
| 1701 | |||
| 1702 | evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; | ||
| 1703 | evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; | ||
| 1704 | evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; | ||
| 1705 | evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; | ||
| 1706 | |||
| 1707 | evmcs->guest_pending_dbg_exceptions = | ||
| 1708 | vmcs12->guest_pending_dbg_exceptions; | ||
| 1709 | evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; | ||
| 1710 | evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; | ||
| 1711 | |||
| 1712 | evmcs->guest_activity_state = vmcs12->guest_activity_state; | ||
| 1713 | evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; | ||
| 1714 | |||
| 1715 | evmcs->guest_cr0 = vmcs12->guest_cr0; | ||
| 1716 | evmcs->guest_cr3 = vmcs12->guest_cr3; | ||
| 1717 | evmcs->guest_cr4 = vmcs12->guest_cr4; | ||
| 1718 | evmcs->guest_dr7 = vmcs12->guest_dr7; | ||
| 1719 | |||
| 1720 | evmcs->guest_physical_address = vmcs12->guest_physical_address; | ||
| 1721 | |||
| 1722 | evmcs->vm_instruction_error = vmcs12->vm_instruction_error; | ||
| 1723 | evmcs->vm_exit_reason = vmcs12->vm_exit_reason; | ||
| 1724 | evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; | ||
| 1725 | evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; | ||
| 1726 | evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; | ||
| 1727 | evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; | ||
| 1728 | evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; | ||
| 1729 | evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; | ||
| 1730 | |||
| 1731 | evmcs->exit_qualification = vmcs12->exit_qualification; | ||
| 1732 | |||
| 1733 | evmcs->guest_linear_address = vmcs12->guest_linear_address; | ||
| 1734 | evmcs->guest_rsp = vmcs12->guest_rsp; | ||
| 1735 | evmcs->guest_rflags = vmcs12->guest_rflags; | ||
| 1736 | |||
| 1737 | evmcs->guest_interruptibility_info = | ||
| 1738 | vmcs12->guest_interruptibility_info; | ||
| 1739 | evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; | ||
| 1740 | evmcs->vm_entry_controls = vmcs12->vm_entry_controls; | ||
| 1741 | evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; | ||
| 1742 | evmcs->vm_entry_exception_error_code = | ||
| 1743 | vmcs12->vm_entry_exception_error_code; | ||
| 1744 | evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; | ||
| 1745 | |||
| 1746 | evmcs->guest_rip = vmcs12->guest_rip; | ||
| 1747 | |||
| 1748 | evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; | ||
| 1749 | |||
| 1750 | return 0; | ||
| 1751 | } | ||
| 1752 | |||
| 1753 | /* | ||
| 1754 | * This is an equivalent of the nested hypervisor executing the vmptrld | ||
| 1755 | * instruction. | ||
| 1756 | */ | ||
| 1757 | static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, | ||
| 1758 | bool from_launch) | ||
| 1759 | { | ||
| 1760 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 1761 | struct hv_vp_assist_page assist_page; | ||
| 1762 | |||
| 1763 | if (likely(!vmx->nested.enlightened_vmcs_enabled)) | ||
| 1764 | return 1; | ||
| 1765 | |||
| 1766 | if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page))) | ||
| 1767 | return 1; | ||
| 1768 | |||
| 1769 | if (unlikely(!assist_page.enlighten_vmentry)) | ||
| 1770 | return 1; | ||
| 1771 | |||
| 1772 | if (unlikely(assist_page.current_nested_vmcs != | ||
| 1773 | vmx->nested.hv_evmcs_vmptr)) { | ||
| 1774 | |||
| 1775 | if (!vmx->nested.hv_evmcs) | ||
| 1776 | vmx->nested.current_vmptr = -1ull; | ||
| 1777 | |||
| 1778 | nested_release_evmcs(vcpu); | ||
| 1779 | |||
| 1780 | vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page( | ||
| 1781 | vcpu, assist_page.current_nested_vmcs); | ||
| 1782 | |||
| 1783 | if (unlikely(is_error_page(vmx->nested.hv_evmcs_page))) | ||
| 1784 | return 0; | ||
| 1785 | |||
| 1786 | vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page); | ||
| 1787 | |||
| 1788 | /* | ||
| 1789 | * Currently, KVM only supports eVMCS version 1 | ||
| 1790 | * (== KVM_EVMCS_VERSION), so we expect the guest to set this | ||
| 1791 | * value in the first u32 field of the eVMCS, which specifies | ||
| 1792 | * the eVMCS VersionNumber. | ||
| 1793 | * | ||
| 1794 | * The guest learns which eVMCS versions the host supports by | ||
| 1795 | * examining CPUID.0x4000000A.EAX[0:15]. The host userspace VMM | ||
| 1796 | * is expected to set this CPUID leaf according to the value | ||
| 1797 | * returned in vmcs_version from nested_enable_evmcs(). | ||
| 1798 | * | ||
| 1799 | * However, it turns out that Microsoft Hyper-V fails to comply | ||
| 1800 | * with its own invented interface: when Hyper-V uses eVMCS, it | ||
| 1801 | * sets the first u32 field of the eVMCS to the revision_id | ||
| 1802 | * specified in MSR_IA32_VMX_BASIC instead of an eVMCS version | ||
| 1803 | * number, i.e. one of the supported versions specified in | ||
| 1804 | * CPUID.0x4000000A.EAX[0:15]. | ||
| 1805 | * | ||
| 1806 | * To work around this Hyper-V bug, we accept either a supported | ||
| 1807 | * eVMCS version or the VMCS12 revision_id as a valid value for | ||
| 1808 | * the first u32 field of the eVMCS. | ||
| 1809 | */ | ||
| 1810 | if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && | ||
| 1811 | (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { | ||
| 1812 | nested_release_evmcs(vcpu); | ||
| 1813 | return 0; | ||
| 1814 | } | ||
| 1815 | |||
| 1816 | vmx->nested.dirty_vmcs12 = true; | ||
| 1817 | /* | ||
| 1818 | * As we keep L2 state for only one guest, the 'hv_clean_fields' | ||
| 1819 | * mask cannot be used when we switch between guests. Reset it | ||
| 1820 | * here for simplicity. | ||
| 1821 | */ | ||
| 1822 | vmx->nested.hv_evmcs->hv_clean_fields &= | ||
| 1823 | ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; | ||
| 1824 | vmx->nested.hv_evmcs_vmptr = assist_page.current_nested_vmcs; | ||
| 1825 | |||
| 1826 | /* | ||
| 1827 | * Unlike normal vmcs12, enlightened vmcs12 is not fully | ||
| 1828 | * reloaded from the guest's memory (read-only fields, fields not | ||
| 1829 | * present in struct hv_enlightened_vmcs, ...). Make sure there | ||
| 1830 | * are no leftovers. | ||
| 1831 | */ | ||
| 1832 | if (from_launch) { | ||
| 1833 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 1834 | memset(vmcs12, 0, sizeof(*vmcs12)); | ||
| 1835 | vmcs12->hdr.revision_id = VMCS12_REVISION; | ||
| 1836 | } | ||
| 1837 | |||
| 1838 | } | ||
| 1839 | return 1; | ||
| 1840 | } | ||
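/*
 * The acceptance test described in the comment above reduces to a single
 * predicate.  A minimal sketch (hypothetical helper, not part of this
 * file), reusing the KVM_EVMCS_VERSION and VMCS12_REVISION constants that
 * the code above already relies on:
 */
static inline bool nested_evmcs_revision_id_valid(u32 revision_id)
{
	/* Accept the supported eVMCS version or Hyper-V's bogus VMCS12 id. */
	return revision_id == KVM_EVMCS_VERSION ||
	       revision_id == VMCS12_REVISION;
}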
| 1841 | |||
| 1842 | void nested_sync_from_vmcs12(struct kvm_vcpu *vcpu) | ||
| 1843 | { | ||
| 1844 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 1845 | |||
| 1846 | /* | ||
| 1847 | * hv_evmcs may end up not being mapped after migration (when | ||
| 1848 | * L2 was running); map it here to make sure vmcs12 changes are | ||
| 1849 | * properly reflected. | ||
| 1850 | */ | ||
| 1851 | if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) | ||
| 1852 | nested_vmx_handle_enlightened_vmptrld(vcpu, false); | ||
| 1853 | |||
| 1854 | if (vmx->nested.hv_evmcs) { | ||
| 1855 | copy_vmcs12_to_enlightened(vmx); | ||
| 1856 | /* All fields are clean */ | ||
| 1857 | vmx->nested.hv_evmcs->hv_clean_fields |= | ||
| 1858 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; | ||
| 1859 | } else { | ||
| 1860 | copy_vmcs12_to_shadow(vmx); | ||
| 1861 | } | ||
| 1862 | |||
| 1863 | vmx->nested.need_vmcs12_sync = false; | ||
| 1864 | } | ||
| 1865 | |||
| 1866 | static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) | ||
| 1867 | { | ||
| 1868 | struct vcpu_vmx *vmx = | ||
| 1869 | container_of(timer, struct vcpu_vmx, nested.preemption_timer); | ||
| 1870 | |||
| 1871 | vmx->nested.preemption_timer_expired = true; | ||
| 1872 | kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); | ||
| 1873 | kvm_vcpu_kick(&vmx->vcpu); | ||
| 1874 | |||
| 1875 | return HRTIMER_NORESTART; | ||
| 1876 | } | ||
| 1877 | |||
| 1878 | static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) | ||
| 1879 | { | ||
| 1880 | u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; | ||
| 1881 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 1882 | |||
| 1883 | /* | ||
| 1884 | * A timer value of zero is architecturally guaranteed to cause | ||
| 1885 | * a VMExit prior to executing any instructions in the guest. | ||
| 1886 | */ | ||
| 1887 | if (preemption_timeout == 0) { | ||
| 1888 | vmx_preemption_timer_fn(&vmx->nested.preemption_timer); | ||
| 1889 | return; | ||
| 1890 | } | ||
| 1891 | |||
| 1892 | if (vcpu->arch.virtual_tsc_khz == 0) | ||
| 1893 | return; | ||
| 1894 | |||
| 1895 | preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; | ||
| 1896 | preemption_timeout *= 1000000; | ||
| 1897 | do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); | ||
| 1898 | hrtimer_start(&vmx->nested.preemption_timer, | ||
| 1899 | ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); | ||
| 1900 | } | ||
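/*
 * Worked example for the conversion above (illustrative values, assuming
 * VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE == 5): with
 * vmx_preemption_timer_value = 1000 and virtual_tsc_khz = 2000000 (a 2 GHz
 * guest TSC), the timer counts 1000 << 5 = 32000 TSC ticks, and
 * 32000 * 1000000 / 2000000 = 16000, so the hrtimer is armed for 16000 ns.
 */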
| 1901 | |||
| 1902 | static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) | ||
| 1903 | { | ||
| 1904 | if (vmx->nested.nested_run_pending && | ||
| 1905 | (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) | ||
| 1906 | return vmcs12->guest_ia32_efer; | ||
| 1907 | else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) | ||
| 1908 | return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); | ||
| 1909 | else | ||
| 1910 | return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); | ||
| 1911 | } | ||
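/*
 * Worked example for nested_vmx_calc_efer() (illustrative): when the nested
 * entry is pending and vmcs12 sets VM_ENTRY_LOAD_IA32_EFER, L2 simply gets
 * vmcs12->guest_ia32_efer; otherwise the vCPU's current EFER is reused,
 * with LMA and LME forced on if VM_ENTRY_IA32E_MODE is set and forced off
 * if it is clear.
 */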
| 1912 | |||
| 1913 | static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) | ||
| 1914 | { | ||
| 1915 | /* | ||
| 1916 | * If vmcs02 hasn't been initialized, set the constant vmcs02 state | ||
| 1917 | * according to L0's settings (vmcs12 is irrelevant here). Host | ||
| 1918 | * fields that come from L0 and are not constant, e.g. HOST_CR3, | ||
| 1919 | * will be set as needed prior to VMLAUNCH/VMRESUME. | ||
| 1920 | */ | ||
| 1921 | if (vmx->nested.vmcs02_initialized) | ||
| 1922 | return; | ||
| 1923 | vmx->nested.vmcs02_initialized = true; | ||
| 1924 | |||
| 1925 | /* | ||
| 1926 | * We don't care what the EPTP value is; we just need to guarantee | ||
| 1927 | * it's valid so that we don't get a false positive when doing early | ||
| 1928 | * consistency checks. | ||
| 1929 | */ | ||
| 1930 | if (enable_ept && nested_early_check) | ||
| 1931 | vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0)); | ||
| 1932 | |||
| 1933 | /* All VMFUNCs are currently emulated through L0 vmexits. */ | ||
| 1934 | if (cpu_has_vmx_vmfunc()) | ||
| 1935 | vmcs_write64(VM_FUNCTION_CONTROL, 0); | ||
| 1936 | |||
| 1937 | if (cpu_has_vmx_posted_intr()) | ||
| 1938 | vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); | ||
| 1939 | |||
| 1940 | if (cpu_has_vmx_msr_bitmap()) | ||
| 1941 | vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); | ||
| 1942 | |||
| 1943 | if (enable_pml) | ||
| 1944 | vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); | ||
| 1945 | |||
| 1946 | /* | ||
| 1947 | * Set the MSR load/store lists to match L0's settings. Only the | ||
| 1948 | * addresses are constant (for vmcs02); the counts can change based | ||
| 1949 | * on L2's behavior, e.g. switching to/from long mode. | ||
| 1950 | */ | ||
| 1951 | vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); | ||
| 1952 | vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); | ||
| 1953 | vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); | ||
| 1954 | |||
| 1955 | vmx_set_constant_host_state(vmx); | ||
| 1956 | } | ||
| 1957 | |||
| 1958 | static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx, | ||
| 1959 | struct vmcs12 *vmcs12) | ||
| 1960 | { | ||
| 1961 | prepare_vmcs02_constant_state(vmx); | ||
| 1962 | |||
| 1963 | vmcs_write64(VMCS_LINK_POINTER, -1ull); | ||
| 1964 | |||
| 1965 | if (enable_vpid) { | ||
| 1966 | if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) | ||
| 1967 | vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); | ||
| 1968 | else | ||
| 1969 | vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); | ||
| 1970 | } | ||
| 1971 | } | ||
| 1972 | |||
| 1973 | static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) | ||
| 1974 | { | ||
| 1975 | u32 exec_control, vmcs12_exec_ctrl; | ||
| 1976 | u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); | ||
| 1977 | |||
| 1978 | if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) | ||
| 1979 | prepare_vmcs02_early_full(vmx, vmcs12); | ||
| 1980 | |||
| 1981 | /* | ||
| 1982 | * HOST_RSP is normally set correctly in vmx_vcpu_run() just before | ||
| 1983 | * entry, but only if the current (host) sp changed from the value | ||
| 1984 | * we wrote last (vmx->host_rsp). This cache is no longer relevant | ||
| 1985 | * if we switch vmcs, and rather than hold a separate cache per vmcs, | ||
| 1986 | * here we just force the write to happen on entry. host_rsp will | ||
| 1987 | * also be written unconditionally by nested_vmx_check_vmentry_hw() | ||
| 1988 | * if we are doing early consistency checks via hardware. | ||
| 1989 | */ | ||
| 1990 | vmx->host_rsp = 0; | ||
| 1991 | |||
| 1992 | /* | ||
| 1993 | * PIN CONTROLS | ||
| 1994 | */ | ||
| 1995 | exec_control = vmcs12->pin_based_vm_exec_control; | ||
| 1996 | |||
| 1997 | /* Preemption timer setting is computed directly in vmx_vcpu_run. */ | ||
| 1998 | exec_control |= vmcs_config.pin_based_exec_ctrl; | ||
| 1999 | exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; | ||
| 2000 | vmx->loaded_vmcs->hv_timer_armed = false; | ||
| 2001 | |||
| 2002 | /* Posted interrupts setting is only taken from vmcs12. */ | ||
| 2003 | if (nested_cpu_has_posted_intr(vmcs12)) { | ||
| 2004 | vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; | ||
| 2005 | vmx->nested.pi_pending = false; | ||
| 2006 | } else { | ||
| 2007 | exec_control &= ~PIN_BASED_POSTED_INTR; | ||
| 2008 | } | ||
| 2009 | vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); | ||
| 2010 | |||
| 2011 | /* | ||
| 2012 | * EXEC CONTROLS | ||
| 2013 | */ | ||
| 2014 | exec_control = vmx_exec_control(vmx); /* L0's desires */ | ||
| 2015 | exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; | ||
| 2016 | exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; | ||
| 2017 | exec_control &= ~CPU_BASED_TPR_SHADOW; | ||
| 2018 | exec_control |= vmcs12->cpu_based_vm_exec_control; | ||
| 2019 | |||
| 2020 | /* | ||
| 2021 | * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if | ||
| 2022 | * nested_get_vmcs12_pages can't fix it up, the illegal value | ||
| 2023 | * will result in a VM entry failure. | ||
| 2024 | */ | ||
| 2025 | if (exec_control & CPU_BASED_TPR_SHADOW) { | ||
| 2026 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); | ||
| 2027 | vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); | ||
| 2028 | } else { | ||
| 2029 | #ifdef CONFIG_X86_64 | ||
| 2030 | exec_control |= CPU_BASED_CR8_LOAD_EXITING | | ||
| 2031 | CPU_BASED_CR8_STORE_EXITING; | ||
| 2032 | #endif | ||
| 2033 | } | ||
| 2034 | |||
| 2035 | /* | ||
| 2036 | * A vmexit (to either L1 hypervisor or L0 userspace) is always needed | ||
| 2037 | * for I/O port accesses. | ||
| 2038 | */ | ||
| 2039 | exec_control &= ~CPU_BASED_USE_IO_BITMAPS; | ||
| 2040 | exec_control |= CPU_BASED_UNCOND_IO_EXITING; | ||
| 2041 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); | ||
| 2042 | |||
| 2043 | /* | ||
| 2044 | * SECONDARY EXEC CONTROLS | ||
| 2045 | */ | ||
| 2046 | if (cpu_has_secondary_exec_ctrls()) { | ||
| 2047 | exec_control = vmx->secondary_exec_control; | ||
| 2048 | |||
| 2049 | /* Take the following fields only from vmcs12 */ | ||
| 2050 | exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | | ||
| 2051 | SECONDARY_EXEC_ENABLE_INVPCID | | ||
| 2052 | SECONDARY_EXEC_RDTSCP | | ||
| 2053 | SECONDARY_EXEC_XSAVES | | ||
| 2054 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | | ||
| 2055 | SECONDARY_EXEC_APIC_REGISTER_VIRT | | ||
| 2056 | SECONDARY_EXEC_ENABLE_VMFUNC); | ||
| 2057 | if (nested_cpu_has(vmcs12, | ||
| 2058 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) { | ||
| 2059 | vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control & | ||
| 2060 | ~SECONDARY_EXEC_ENABLE_PML; | ||
| 2061 | exec_control |= vmcs12_exec_ctrl; | ||
| 2062 | } | ||
| 2063 | |||
| 2064 | /* VMCS shadowing for L2 is emulated for now */ | ||
| 2065 | exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; | ||
| 2066 | |||
| 2067 | if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) | ||
| 2068 | vmcs_write16(GUEST_INTR_STATUS, | ||
| 2069 | vmcs12->guest_intr_status); | ||
| 2070 | |||
| 2071 | /* | ||
| 2072 | * Write an illegal value to APIC_ACCESS_ADDR. Later, | ||
| 2073 | * nested_get_vmcs12_pages will either fix it up or | ||
| 2074 | * remove the VM execution control. | ||
| 2075 | */ | ||
| 2076 | if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) | ||
| 2077 | vmcs_write64(APIC_ACCESS_ADDR, -1ull); | ||
| 2078 | |||
| 2079 | if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) | ||
| 2080 | vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); | ||
| 2081 | |||
| 2082 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); | ||
| 2083 | } | ||
| 2084 | |||
| 2085 | /* | ||
| 2086 | * ENTRY CONTROLS | ||
| 2087 | * | ||
| 2088 | * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE | ||
| 2089 | * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate | ||
| 2090 | * on the related bits (if supported by the CPU) in the hope that | ||
| 2091 | * we can avoid VMWrites during vmx_set_efer(). | ||
| 2092 | */ | ||
| 2093 | exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) & | ||
| 2094 | ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER; | ||
| 2095 | if (cpu_has_load_ia32_efer()) { | ||
| 2096 | if (guest_efer & EFER_LMA) | ||
| 2097 | exec_control |= VM_ENTRY_IA32E_MODE; | ||
| 2098 | if (guest_efer != host_efer) | ||
| 2099 | exec_control |= VM_ENTRY_LOAD_IA32_EFER; | ||
| 2100 | } | ||
| 2101 | vm_entry_controls_init(vmx, exec_control); | ||
| 2102 | |||
| 2103 | /* | ||
| 2104 | * EXIT CONTROLS | ||
| 2105 | * | ||
| 2106 | * L2->L1 exit controls are emulated - the hardware exit is to L0 so | ||
| 2107 | * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER | ||
| 2108 | * bits may be modified by vmx_set_efer() in prepare_vmcs02(). | ||
| 2109 | */ | ||
| 2110 | exec_control = vmx_vmexit_ctrl(); | ||
| 2111 | if (cpu_has_load_ia32_efer() && guest_efer != host_efer) | ||
| 2112 | exec_control |= VM_EXIT_LOAD_IA32_EFER; | ||
| 2113 | vm_exit_controls_init(vmx, exec_control); | ||
| 2114 | |||
| 2115 | /* | ||
| 2116 | * Conceptually we want to copy the PML address and index from | ||
| 2117 | * vmcs01 here, and then back to vmcs01 on nested vmexit. But, | ||
| 2118 | * since we always flush the log on each vmexit and never change | ||
| 2119 | * the PML address (once set), this happens to be equivalent to | ||
| 2120 | * simply resetting the index in vmcs02. | ||
| 2121 | */ | ||
| 2122 | if (enable_pml) | ||
| 2123 | vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); | ||
| 2124 | |||
| 2125 | /* | ||
| 2126 | * Interrupt/Exception Fields | ||
| 2127 | */ | ||
| 2128 | if (vmx->nested.nested_run_pending) { | ||
| 2129 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
| 2130 | vmcs12->vm_entry_intr_info_field); | ||
| 2131 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, | ||
| 2132 | vmcs12->vm_entry_exception_error_code); | ||
| 2133 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, | ||
| 2134 | vmcs12->vm_entry_instruction_len); | ||
| 2135 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, | ||
| 2136 | vmcs12->guest_interruptibility_info); | ||
| 2137 | vmx->loaded_vmcs->nmi_known_unmasked = | ||
| 2138 | !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); | ||
| 2139 | } else { | ||
| 2140 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); | ||
| 2141 | } | ||
| 2142 | } | ||
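/*
 * The pin-based and processor-based blocks above follow one pattern: start
 * from L0's required controls, clear the bits that vmcs12 is allowed to
 * own, then OR in L1's requests.  A minimal sketch of that rule
 * (hypothetical helper, not part of this file):
 */
static inline u32 nested_merge_controls(u32 l0_ctls, u32 vmcs12_owned_mask,
					u32 vmcs12_ctls)
{
	/*
	 * For example, an L1 that sets CPU_BASED_RDTSC_EXITING gets RDTSC
	 * exits in L2 even though L0 does not request them for its own guest.
	 */
	return (l0_ctls & ~vmcs12_owned_mask) | vmcs12_ctls;
}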
| 2143 | |||
| 2144 | static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) | ||
| 2145 | { | ||
| 2146 | struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; | ||
| 2147 | |||
| 2148 | if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & | ||
| 2149 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { | ||
| 2150 | vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); | ||
| 2151 | vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); | ||
| 2152 | vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); | ||
| 2153 | vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); | ||
| 2154 | vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); | ||
| 2155 | vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); | ||
| 2156 | vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); | ||
| 2157 | vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); | ||
| 2158 | vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); | ||
| 2159 | vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); | ||
| 2160 | vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); | ||
| 2161 | vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); | ||
| 2162 | vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); | ||
| 2163 | vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); | ||
| 2164 | vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); | ||
| 2165 | vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); | ||
| 2166 | vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); | ||
| 2167 | vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); | ||
| 2168 | vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); | ||
| 2169 | vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); | ||
| 2170 | vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); | ||
| 2171 | vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); | ||
| 2172 | vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); | ||
| 2173 | vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); | ||
| 2174 | vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); | ||
| 2175 | vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); | ||
| 2176 | vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); | ||
| 2177 | vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); | ||
| 2178 | vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); | ||
| 2179 | vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); | ||
| 2180 | vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); | ||
| 2181 | vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); | ||
| 2182 | vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); | ||
| 2183 | vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); | ||
| 2184 | } | ||
| 2185 | |||
| 2186 | if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & | ||
| 2187 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { | ||
| 2188 | vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); | ||
| 2189 | vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, | ||
| 2190 | vmcs12->guest_pending_dbg_exceptions); | ||
| 2191 | vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); | ||
| 2192 | vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); | ||
| 2193 | |||
| 2194 | /* | ||
| 2195 | * L1 may access L2's PDPTRs, so save them in order to construct | ||
| 2196 | * vmcs12. | ||
| 2197 | */ | ||
| 2198 | if (enable_ept) { | ||
| 2199 | vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); | ||
| 2200 | vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); | ||
| 2201 | vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); | ||
| 2202 | vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); | ||
| 2203 | } | ||
| 2204 | } | ||
| 2205 | |||
| 2206 | if (nested_cpu_has_xsaves(vmcs12)) | ||
| 2207 | vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); | ||
| 2208 | |||
| 2209 | /* | ||
| 2210 | * Whether page-faults are trapped is determined by a combination of | ||
| 2211 | * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. | ||
| 2212 | * If enable_ept, L0 doesn't care about page faults and we should | ||
| 2213 | * set all of these to L1's desires. However, if !enable_ept, L0 does | ||
| 2214 | * care about (at least some) page faults, and because it is not easy | ||
| 2215 | * (if at all possible?) to merge L0 and L1's desires, we simply ask | ||
| 2216 | * to exit on each and every L2 page fault. This is done by setting | ||
| 2217 | * MASK=MATCH=0 and (see below) EB.PF=1. | ||
| 2218 | * Note that below we don't need special code to set EB.PF beyond the | ||
| 2219 | * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, | ||
| 2220 | * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when | ||
| 2221 | * !enable_ept, EB.PF is 1, so the "or" will always be 1. | ||
| 2222 | */ | ||
| 2223 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, | ||
| 2224 | enable_ept ? vmcs12->page_fault_error_code_mask : 0); | ||
| 2225 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, | ||
| 2226 | enable_ept ? vmcs12->page_fault_error_code_match : 0); | ||
| 2227 | |||
| 2228 | if (cpu_has_vmx_apicv()) { | ||
| 2229 | vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); | ||
| 2230 | vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); | ||
| 2231 | vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); | ||
| 2232 | vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); | ||
| 2233 | } | ||
| 2234 | |||
| 2235 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); | ||
| 2236 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); | ||
| 2237 | |||
| 2238 | set_cr4_guest_host_mask(vmx); | ||
| 2239 | |||
| 2240 | if (kvm_mpx_supported()) { | ||
| 2241 | if (vmx->nested.nested_run_pending && | ||
| 2242 | (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) | ||
| 2243 | vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); | ||
| 2244 | else | ||
| 2245 | vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); | ||
| 2246 | } | ||
| 2247 | } | ||
| 2248 | |||
| 2249 | /* | ||
| 2250 | * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested | ||
| 2251 | * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it | ||
| 2252 | * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 | ||
| 2253 | * guest in a way that is appropriate both to L1's requests and to our | ||
| 2254 | * own needs. In addition to modifying the active vmcs (which is vmcs02), | ||
| 2255 | * this function also has necessary side effects, like setting various | ||
| 2256 | * vcpu->arch fields. | ||
| 2257 | * Returns 0 on success, 1 on failure. On failure, the invalid-state exit | ||
| 2258 | * qualification code is assigned to entry_failure_code. | ||
| 2259 | */ | ||
| 2260 | static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, | ||
| 2261 | u32 *entry_failure_code) | ||
| 2262 | { | ||
| 2263 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 2264 | struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; | ||
| 2265 | |||
| 2266 | if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) { | ||
| 2267 | prepare_vmcs02_full(vmx, vmcs12); | ||
| 2268 | vmx->nested.dirty_vmcs12 = false; | ||
| 2269 | } | ||
| 2270 | |||
| 2271 | /* | ||
| 2272 | * First, the fields that are shadowed. This must be kept in sync | ||
| 2273 | * with vmcs_shadow_fields.h. | ||
| 2274 | */ | ||
| 2275 | if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & | ||
| 2276 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { | ||
| 2277 | vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); | ||
| 2278 | vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); | ||
| 2279 | } | ||
| 2280 | |||
| 2281 | if (vmx->nested.nested_run_pending && | ||
| 2282 | (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { | ||
| 2283 | kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); | ||
| 2284 | vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); | ||
| 2285 | } else { | ||
| 2286 | kvm_set_dr(vcpu, 7, vcpu->arch.dr7); | ||
| 2287 | vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); | ||
| 2288 | } | ||
| 2289 | vmx_set_rflags(vcpu, vmcs12->guest_rflags); | ||
| 2290 | |||
| 2291 | vmx->nested.preemption_timer_expired = false; | ||
| 2292 | if (nested_cpu_has_preemption_timer(vmcs12)) | ||
| 2293 | vmx_start_preemption_timer(vcpu); | ||
| 2294 | |||
| 2295 | /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the | ||
| 2296 | * bitwise-or of what L1 wants to trap for L2, and what we want to | ||
| 2297 | * trap. Note that CR0.TS also needs updating - we do this later. | ||
| 2298 | */ | ||
| 2299 | update_exception_bitmap(vcpu); | ||
| 2300 | vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; | ||
| 2301 | vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); | ||
| 2302 | |||
| 2303 | if (vmx->nested.nested_run_pending && | ||
| 2304 | (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { | ||
| 2305 | vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); | ||
| 2306 | vcpu->arch.pat = vmcs12->guest_ia32_pat; | ||
| 2307 | } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { | ||
| 2308 | vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); | ||
| 2309 | } | ||
| 2310 | |||
| 2311 | vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); | ||
| 2312 | |||
| 2313 | if (kvm_has_tsc_control) | ||
| 2314 | decache_tsc_multiplier(vmx); | ||
| 2315 | |||
| 2316 | if (enable_vpid) { | ||
| 2317 | /* | ||
| 2318 | * There is no direct mapping between vpid02 and vpid12: vpid02 | ||
| 2319 | * is per-vCPU for L0 and is reused, while a change of vpid12 is | ||
| 2320 | * handled with one invvpid during nested vmentry. vpid12 is | ||
| 2321 | * allocated by L1 for L2, so it does not influence the global | ||
| 2322 | * bitmap (used for vpid01 and vpid02 allocation) even if L1 | ||
| 2323 | * spawns a lot of nested vCPUs. | ||
| 2324 | */ | ||
| 2325 | if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) { | ||
| 2326 | if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { | ||
| 2327 | vmx->nested.last_vpid = vmcs12->virtual_processor_id; | ||
| 2328 | __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false); | ||
| 2329 | } | ||
| 2330 | } else { | ||
| 2331 | /* | ||
| 2332 | * If L1 uses EPT, then L0 needs to execute INVEPT on | ||
| 2333 | * EPTP02 instead of EPTP01. Therefore, delay the TLB | ||
| 2334 | * flush until vmcs02->eptp has been fully updated by | ||
| 2335 | * KVM_REQ_LOAD_CR3. Note that this assumes | ||
| 2336 | * KVM_REQ_TLB_FLUSH is evaluated after | ||
| 2337 | * KVM_REQ_LOAD_CR3 in vcpu_enter_guest(). | ||
| 2338 | */ | ||
| 2339 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); | ||
| 2340 | } | ||
| 2341 | } | ||
| 2342 | |||
| 2343 | if (nested_cpu_has_ept(vmcs12)) | ||
| 2344 | nested_ept_init_mmu_context(vcpu); | ||
| 2345 | else if (nested_cpu_has2(vmcs12, | ||
| 2346 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) | ||
| 2347 | vmx_flush_tlb(vcpu, true); | ||
| 2348 | |||
| 2349 | /* | ||
| 2350 | * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those | ||
| 2351 | * bits which we consider mandatory enabled. | ||
| 2352 | * The CR0_READ_SHADOW is what L2 should have expected to read given | ||
| 2353 | * the specifications by L1; it's not enough to take | ||
| 2354 | * vmcs12->cr0_read_shadow because our cr0_guest_host_mask may | ||
| 2355 | * have more bits than L1 expected. | ||
| 2356 | */ | ||
| 2357 | vmx_set_cr0(vcpu, vmcs12->guest_cr0); | ||
| 2358 | vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); | ||
| 2359 | |||
| 2360 | vmx_set_cr4(vcpu, vmcs12->guest_cr4); | ||
| 2361 | vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); | ||
| 2362 | |||
| 2363 | vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); | ||
| 2364 | /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ | ||
| 2365 | vmx_set_efer(vcpu, vcpu->arch.efer); | ||
| 2366 | |||
| 2367 | /* | ||
| 2368 | * If guest state is invalid and unrestricted guest is disabled, | ||
| 2369 | * then L1 attempted a VMEntry to L2 with invalid state: fail | ||
| 2370 | * the VMEntry. | ||
| 2371 | */ | ||
| 2372 | if (vmx->emulation_required) { | ||
| 2373 | *entry_failure_code = ENTRY_FAIL_DEFAULT; | ||
| 2374 | return 1; | ||
| 2375 | } | ||
| 2376 | |||
| 2377 | /* Load guest_cr3, backed by either EPT or shadow page tables. */ | ||
| 2378 | if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), | ||
| 2379 | entry_failure_code)) | ||
| 2380 | return 1; | ||
| 2381 | |||
| 2382 | if (!enable_ept) | ||
| 2383 | vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; | ||
| 2384 | |||
| 2385 | kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); | ||
| 2386 | kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); | ||
| 2387 | return 0; | ||
| 2388 | } | ||
| 2389 | |||
| 2390 | static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) | ||
| 2391 | { | ||
| 2392 | if (!nested_cpu_has_nmi_exiting(vmcs12) && | ||
| 2393 | nested_cpu_has_virtual_nmis(vmcs12)) | ||
| 2394 | return -EINVAL; | ||
| 2395 | |||
| 2396 | if (!nested_cpu_has_virtual_nmis(vmcs12) && | ||
| 2397 | nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING)) | ||
| 2398 | return -EINVAL; | ||
| 2399 | |||
| 2400 | return 0; | ||
| 2401 | } | ||
| 2402 | |||
| 2403 | static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) | ||
| 2404 | { | ||
| 2405 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 2406 | int maxphyaddr = cpuid_maxphyaddr(vcpu); | ||
| 2407 | |||
| 2408 | /* Check for memory type validity */ | ||
| 2409 | switch (address & VMX_EPTP_MT_MASK) { | ||
| 2410 | case VMX_EPTP_MT_UC: | ||
| 2411 | if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)) | ||
| 2412 | return false; | ||
| 2413 | break; | ||
| 2414 | case VMX_EPTP_MT_WB: | ||
| 2415 | if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)) | ||
| 2416 | return false; | ||
| 2417 | break; | ||
| 2418 | default: | ||
| 2419 | return false; | ||
| 2420 | } | ||
| 2421 | |||
| 2422 | /* Only a 4-level page-walk length is valid. */ | ||
| 2423 | if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4) | ||
| 2424 | return false; | ||
| 2425 | |||
| 2426 | /* Reserved bits should not be set */ | ||
| 2427 | if (address >> maxphyaddr || ((address >> 7) & 0x1f)) | ||
| 2428 | return false; | ||
| 2429 | |||
| 2430 | /* AD, if set, should be supported */ | ||
| 2431 | if (address & VMX_EPTP_AD_ENABLE_BIT) { | ||
| 2432 | if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)) | ||
| 2433 | return false; | ||
| 2434 | } | ||
| 2435 | |||
| 2436 | return true; | ||
| 2437 | } | ||
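/*
 * Worked example for the checks above (illustrative value): an EPTP of
 * (pml4_pa | 0x5e) encodes a write-back memory type (bits 2:0 = 6), a
 * 4-level page walk (bits 5:3 = 3) and enabled A/D bits (bit 6), with the
 * reserved bits 11:7 clear; it passes valid_ept_address() as long as the
 * corresponding capability bits are set in vmx->nested.msrs.ept_caps and
 * pml4_pa does not exceed the guest's MAXPHYADDR.
 */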
| 2438 | |||
| 2439 | /* | ||
| 2440 | * Checks related to VM-Execution Control Fields | ||
| 2441 | */ | ||
| 2442 | static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, | ||
| 2443 | struct vmcs12 *vmcs12) | ||
| 2444 | { | ||
| 2445 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 2446 | |||
| 2447 | if (!vmx_control_verify(vmcs12->pin_based_vm_exec_control, | ||
| 2448 | vmx->nested.msrs.pinbased_ctls_low, | ||
| 2449 | vmx->nested.msrs.pinbased_ctls_high) || | ||
| 2450 | !vmx_control_verify(vmcs12->cpu_based_vm_exec_control, | ||
| 2451 | vmx->nested.msrs.procbased_ctls_low, | ||
| 2452 | vmx->nested.msrs.procbased_ctls_high)) | ||
| 2453 | return -EINVAL; | ||
| 2454 | |||
| 2455 | if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && | ||
| 2456 | !vmx_control_verify(vmcs12->secondary_vm_exec_control, | ||
| 2457 | vmx->nested.msrs.secondary_ctls_low, | ||
| 2458 | vmx->nested.msrs.secondary_ctls_high)) | ||
| 2459 | return -EINVAL; | ||
| 2460 | |||
| 2461 | if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu) || | ||
| 2462 | nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || | ||
| 2463 | nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || | ||
| 2464 | nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || | ||
| 2465 | nested_vmx_check_apic_access_controls(vcpu, vmcs12) || | ||
| 2466 | nested_vmx_check_apicv_controls(vcpu, vmcs12) || | ||
| 2467 | nested_vmx_check_nmi_controls(vmcs12) || | ||
| 2468 | nested_vmx_check_pml_controls(vcpu, vmcs12) || | ||
| 2469 | nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || | ||
| 2470 | nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || | ||
| 2471 | nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || | ||
| 2472 | (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) | ||
| 2473 | return -EINVAL; | ||
| 2474 | |||
| 2475 | if (nested_cpu_has_ept(vmcs12) && | ||
| 2476 | !valid_ept_address(vcpu, vmcs12->ept_pointer)) | ||
| 2477 | return -EINVAL; | ||
| 2478 | |||
| 2479 | if (nested_cpu_has_vmfunc(vmcs12)) { | ||
| 2480 | if (vmcs12->vm_function_control & | ||
| 2481 | ~vmx->nested.msrs.vmfunc_controls) | ||
| 2482 | return -EINVAL; | ||
| 2483 | |||
| 2484 | if (nested_cpu_has_eptp_switching(vmcs12)) { | ||
| 2485 | if (!nested_cpu_has_ept(vmcs12) || | ||
| 2486 | !page_address_valid(vcpu, vmcs12->eptp_list_address)) | ||
| 2487 | return -EINVAL; | ||
| 2488 | } | ||
| 2489 | } | ||
| 2490 | |||
| 2491 | return 0; | ||
| 2492 | } | ||
| 2493 | |||
| 2494 | /* | ||
| 2495 | * Checks related to VM-Exit Control Fields | ||
| 2496 | */ | ||
| 2497 | static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, | ||
| 2498 | struct vmcs12 *vmcs12) | ||
| 2499 | { | ||
| 2500 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 2501 | |||
| 2502 | if (!vmx_control_verify(vmcs12->vm_exit_controls, | ||
| 2503 | vmx->nested.msrs.exit_ctls_low, | ||
| 2504 | vmx->nested.msrs.exit_ctls_high) || | ||
| 2505 | nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)) | ||
| 2506 | return -EINVAL; | ||
| 2507 | |||
| 2508 | return 0; | ||
| 2509 | } | ||
| 2510 | |||
| 2511 | /* | ||
| 2512 | * Checks related to VM-Entry Control Fields | ||
| 2513 | */ | ||
| 2514 | static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, | ||
| 2515 | struct vmcs12 *vmcs12) | ||
| 2516 | { | ||
| 2517 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 2518 | |||
| 2519 | if (!vmx_control_verify(vmcs12->vm_entry_controls, | ||
| 2520 | vmx->nested.msrs.entry_ctls_low, | ||
| 2521 | vmx->nested.msrs.entry_ctls_high)) | ||
| 2522 | return -EINVAL; | ||
| 2523 | |||
| 2524 | /* | ||
| 2525 | * From the Intel SDM, volume 3: | ||
| 2526 | * Fields relevant to VM-entry event injection must be set properly. | ||
| 2527 | * These fields are the VM-entry interruption-information field, the | ||
| 2528 | * VM-entry exception error code, and the VM-entry instruction length. | ||
| 2529 | */ | ||
| 2530 | if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { | ||
| 2531 | u32 intr_info = vmcs12->vm_entry_intr_info_field; | ||
| 2532 | u8 vector = intr_info & INTR_INFO_VECTOR_MASK; | ||
| 2533 | u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; | ||
| 2534 | bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; | ||
| 2535 | bool should_have_error_code; | ||
| 2536 | bool urg = nested_cpu_has2(vmcs12, | ||
| 2537 | SECONDARY_EXEC_UNRESTRICTED_GUEST); | ||
| 2538 | bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; | ||
| 2539 | |||
| 2540 | /* VM-entry interruption-info field: interruption type */ | ||
| 2541 | if (intr_type == INTR_TYPE_RESERVED || | ||
| 2542 | (intr_type == INTR_TYPE_OTHER_EVENT && | ||
| 2543 | !nested_cpu_supports_monitor_trap_flag(vcpu))) | ||
| 2544 | return -EINVAL; | ||
| 2545 | |||
| 2546 | /* VM-entry interruption-info field: vector */ | ||
| 2547 | if ((intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || | ||
| 2548 | (intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || | ||
| 2549 | (intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) | ||
| 2550 | return -EINVAL; | ||
| 2551 | |||
| 2552 | /* VM-entry interruption-info field: deliver error code */ | ||
| 2553 | should_have_error_code = | ||
| 2554 | intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && | ||
| 2555 | x86_exception_has_error_code(vector); | ||
| 2556 | if (has_error_code != should_have_error_code) | ||
| 2557 | return -EINVAL; | ||
| 2558 | |||
| 2559 | /* VM-entry exception error code */ | ||
| 2560 | if (has_error_code && | ||
| 2561 | vmcs12->vm_entry_exception_error_code & GENMASK(31, 15)) | ||
| 2562 | return -EINVAL; | ||
| 2563 | |||
| 2564 | /* VM-entry interruption-info field: reserved bits */ | ||
| 2565 | if (intr_info & INTR_INFO_RESVD_BITS_MASK) | ||
| 2566 | return -EINVAL; | ||
| 2567 | |||
| 2568 | /* VM-entry instruction length */ | ||
| 2569 | switch (intr_type) { | ||
| 2570 | case INTR_TYPE_SOFT_EXCEPTION: | ||
| 2571 | case INTR_TYPE_SOFT_INTR: | ||
| 2572 | case INTR_TYPE_PRIV_SW_EXCEPTION: | ||
| 2573 | if ((vmcs12->vm_entry_instruction_len > 15) || | ||
| 2574 | (vmcs12->vm_entry_instruction_len == 0 && | ||
| 2575 | !nested_cpu_has_zero_length_injection(vcpu))) | ||
| 2576 | return -EINVAL; | ||
| 2577 | } | ||
| 2578 | } | ||
| 2579 | |||
| 2580 | if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) | ||
| 2581 | return -EINVAL; | ||
| 2582 | |||
| 2583 | return 0; | ||
| 2584 | } | ||
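The deliver-error-code check above relies on x86_exception_has_error_code(), whose body is not part of this hunk. As a rough, hedged illustration of the rule it encodes (only certain hardware exceptions architecturally push an error code, and only in protected mode is one expected), here is a standalone userspace sketch; it mirrors the intent of that helper, but the exact kernel implementation may differ.

/* Standalone sketch (not kernel code): which hard exception vectors are
 * architecturally defined to push an error code, as assumed by the
 * should_have_error_code check above. */
#include <stdbool.h>
#include <stdio.h>

static bool sketch_exception_has_error_code(unsigned int vector)
{
	/* #DF, #TS, #NP, #SS, #GP, #PF, #AC push an error code. */
	static const unsigned int with_error_code[] = { 8, 10, 11, 12, 13, 14, 17 };
	for (unsigned int i = 0; i < sizeof(with_error_code) / sizeof(with_error_code[0]); i++)
		if (vector == with_error_code[i])
			return true;
	return false;
}

int main(void)
{
	/* #GP (vector 13) needs an error code, #UD (vector 6) must not have one. */
	printf("#GP has error code: %d\n", sketch_exception_has_error_code(13));
	printf("#UD has error code: %d\n", sketch_exception_has_error_code(6));
	return 0;
}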
| 2585 | |||
| 2586 | /* | ||
| 2587 | * Checks related to Host Control Registers and MSRs | ||
| 2588 | */ | ||
| 2589 | static int nested_check_host_control_regs(struct kvm_vcpu *vcpu, | ||
| 2590 | struct vmcs12 *vmcs12) | ||
| 2591 | { | ||
| 2592 | bool ia32e; | ||
| 2593 | |||
| 2594 | if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || | ||
| 2595 | !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || | ||
| 2596 | !nested_cr3_valid(vcpu, vmcs12->host_cr3)) | ||
| 2597 | return -EINVAL; | ||
| 2598 | /* | ||
| 2599 | * If the load IA32_EFER VM-exit control is 1, bits reserved in the | ||
| 2600 | * IA32_EFER MSR must be 0 in the field for that register. In addition, | ||
| 2601 | * the values of the LMA and LME bits in the field must each be that of | ||
| 2602 | * the host address-space size VM-exit control. | ||
| 2603 | */ | ||
| 2604 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { | ||
| 2605 | ia32e = (vmcs12->vm_exit_controls & | ||
| 2606 | VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; | ||
| 2607 | if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || | ||
| 2608 | ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || | ||
| 2609 | ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) | ||
| 2610 | return -EINVAL; | ||
| 2611 | } | ||
| 2612 | |||
| 2613 | return 0; | ||
| 2614 | } | ||
| 2615 | |||
| 2616 | /* | ||
| 2617 | * Checks related to Guest Non-register State | ||
| 2618 | */ | ||
| 2619 | static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) | ||
| 2620 | { | ||
| 2621 | if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && | ||
| 2622 | vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) | ||
| 2623 | return -EINVAL; | ||
| 2624 | |||
| 2625 | return 0; | ||
| 2626 | } | ||
| 2627 | |||
| 2628 | static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu, | ||
| 2629 | struct vmcs12 *vmcs12) | ||
| 2630 | { | ||
| 2631 | if (nested_check_vm_execution_controls(vcpu, vmcs12) || | ||
| 2632 | nested_check_vm_exit_controls(vcpu, vmcs12) || | ||
| 2633 | nested_check_vm_entry_controls(vcpu, vmcs12)) | ||
| 2634 | return VMXERR_ENTRY_INVALID_CONTROL_FIELD; | ||
| 2635 | |||
| 2636 | if (nested_check_host_control_regs(vcpu, vmcs12)) | ||
| 2637 | return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; | ||
| 2638 | |||
| 2639 | if (nested_check_guest_non_reg_state(vmcs12)) | ||
| 2640 | return VMXERR_ENTRY_INVALID_CONTROL_FIELD; | ||
| 2641 | |||
| 2642 | return 0; | ||
| 2643 | } | ||
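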
| 2644 | |||
| 2645 | static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, | ||
| 2646 | struct vmcs12 *vmcs12) | ||
| 2647 | { | ||
| 2648 | int r; | ||
| 2649 | struct page *page; | ||
| 2650 | struct vmcs12 *shadow; | ||
| 2651 | |||
| 2652 | if (vmcs12->vmcs_link_pointer == -1ull) | ||
| 2653 | return 0; | ||
| 2654 | |||
| 2655 | if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)) | ||
| 2656 | return -EINVAL; | ||
| 2657 | |||
| 2658 | page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); | ||
| 2659 | if (is_error_page(page)) | ||
| 2660 | return -EINVAL; | ||
| 2661 | |||
| 2662 | r = 0; | ||
| 2663 | shadow = kmap(page); | ||
| 2664 | if (shadow->hdr.revision_id != VMCS12_REVISION || | ||
| 2665 | shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)) | ||
| 2666 | r = -EINVAL; | ||
| 2667 | kunmap(page); | ||
| 2668 | kvm_release_page_clean(page); | ||
| 2669 | return r; | ||
| 2670 | } | ||
| 2671 | |||
| 2672 | static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu, | ||
| 2673 | struct vmcs12 *vmcs12, | ||
| 2674 | u32 *exit_qual) | ||
| 2675 | { | ||
| 2676 | bool ia32e; | ||
| 2677 | |||
| 2678 | *exit_qual = ENTRY_FAIL_DEFAULT; | ||
| 2679 | |||
| 2680 | if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) || | ||
| 2681 | !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) | ||
| 2682 | return 1; | ||
| 2683 | |||
| 2684 | if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { | ||
| 2685 | *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; | ||
| 2686 | return 1; | ||
| 2687 | } | ||
| 2688 | |||
| 2689 | /* | ||
| 2690 | * If the load IA32_EFER VM-entry control is 1, the following checks | ||
| 2691 | * are performed on the field for the IA32_EFER MSR: | ||
| 2692 | * - Bits reserved in the IA32_EFER MSR must be 0. | ||
| 2693 | * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of | ||
| 2694 | * the IA-32e mode guest VM-exit control. It must also be identical | ||
| 2695 | * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to | ||
| 2696 | * CR0.PG) is 1. | ||
| 2697 | */ | ||
| 2698 | if (to_vmx(vcpu)->nested.nested_run_pending && | ||
| 2699 | (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { | ||
| 2700 | ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; | ||
| 2701 | if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) || | ||
| 2702 | ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || | ||
| 2703 | ((vmcs12->guest_cr0 & X86_CR0_PG) && | ||
| 2704 | ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) | ||
| 2705 | return 1; | ||
| 2706 | } | ||
| 2707 | |||
| 2708 | if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && | ||
| 2709 | (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || | ||
| 2710 | (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) | ||
| 2711 | return 1; | ||
| 2712 | |||
| 2713 | return 0; | ||
| 2714 | } | ||
| 2715 | |||
| 2716 | static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) | ||
| 2717 | { | ||
| 2718 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 2719 | unsigned long cr3, cr4; | ||
| 2720 | |||
| 2721 | if (!nested_early_check) | ||
| 2722 | return 0; | ||
| 2723 | |||
| 2724 | if (vmx->msr_autoload.host.nr) | ||
| 2725 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); | ||
| 2726 | if (vmx->msr_autoload.guest.nr) | ||
| 2727 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); | ||
| 2728 | |||
| 2729 | preempt_disable(); | ||
| 2730 | |||
| 2731 | vmx_prepare_switch_to_guest(vcpu); | ||
| 2732 | |||
| 2733 | /* | ||
| 2734 | * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, | ||
| 2735 | * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to | ||
| 2736 | * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. | ||
| 2737 | * there is no need to preserve other bits or save/restore the field. | ||
| 2738 | */ | ||
| 2739 | vmcs_writel(GUEST_RFLAGS, 0); | ||
| 2740 | |||
| 2741 | cr3 = __get_current_cr3_fast(); | ||
| 2742 | if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { | ||
| 2743 | vmcs_writel(HOST_CR3, cr3); | ||
| 2744 | vmx->loaded_vmcs->host_state.cr3 = cr3; | ||
| 2745 | } | ||
| 2746 | |||
| 2747 | cr4 = cr4_read_shadow(); | ||
| 2748 | if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { | ||
| 2749 | vmcs_writel(HOST_CR4, cr4); | ||
| 2750 | vmx->loaded_vmcs->host_state.cr4 = cr4; | ||
| 2751 | } | ||
| 2752 | |||
| 2753 | vmx->__launched = vmx->loaded_vmcs->launched; | ||
| 2754 | |||
| 2755 | asm( | ||
| 2756 | /* Set HOST_RSP */ | ||
| 2757 | "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ | ||
| 2758 | __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" | ||
| 2759 | "mov %%" _ASM_SP ", %c[host_rsp](%1)\n\t" | ||
| 2760 | "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ | ||
| 2761 | |||
| 2762 | /* Check if vmlaunch or vmresume is needed */ | ||
| 2763 | "cmpl $0, %c[launched](%% " _ASM_CX")\n\t" | ||
| 2764 | |||
| 2765 | "call vmx_vmenter\n\t" | ||
| 2766 | |||
| 2767 | /* Set vmx->fail accordingly */ | ||
| 2768 | "setbe %c[fail](%% " _ASM_CX")\n\t" | ||
| 2769 | : ASM_CALL_CONSTRAINT | ||
| 2770 | : "c"(vmx), "d"((unsigned long)HOST_RSP), | ||
| 2771 | [launched]"i"(offsetof(struct vcpu_vmx, __launched)), | ||
| 2772 | [fail]"i"(offsetof(struct vcpu_vmx, fail)), | ||
| 2773 | [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), | ||
| 2774 | [wordsize]"i"(sizeof(ulong)) | ||
| 2775 | : "rax", "cc", "memory" | ||
| 2776 | ); | ||
| 2777 | |||
| 2778 | preempt_enable(); | ||
| 2779 | |||
| 2780 | if (vmx->msr_autoload.host.nr) | ||
| 2781 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); | ||
| 2782 | if (vmx->msr_autoload.guest.nr) | ||
| 2783 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); | ||
| 2784 | |||
| 2785 | if (vmx->fail) { | ||
| 2786 | WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != | ||
| 2787 | VMXERR_ENTRY_INVALID_CONTROL_FIELD); | ||
| 2788 | vmx->fail = 0; | ||
| 2789 | return 1; | ||
| 2790 | } | ||
| 2791 | |||
| 2792 | /* | ||
| 2793 | * VMExit clears RFLAGS.IF and DR7, even on a consistency check. | ||
| 2794 | */ | ||
| 2795 | local_irq_enable(); | ||
| 2796 | if (hw_breakpoint_active()) | ||
| 2797 | set_debugreg(__this_cpu_read(cpu_dr7), 7); | ||
| 2798 | |||
| 2799 | /* | ||
| 2800 | * A non-failing VMEntry means we somehow entered guest mode with | ||
| 2801 | * an illegal RIP, and that's just the tip of the iceberg. There | ||
| 2802 | * is no telling what memory has been modified or what state has | ||
| 2803 | * been exposed to unknown code. Hitting this all but guarantees | ||
| 2804 | * a (very critical) hardware issue. | ||
| 2805 | */ | ||
| 2806 | WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & | ||
| 2807 | VMX_EXIT_REASONS_FAILED_VMENTRY)); | ||
| 2808 | |||
| 2809 | return 0; | ||
| 2810 | } | ||
| 2811 | STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw); | ||
| 2812 | |||
| 2813 | |||
| 2814 | static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, | ||
| 2815 | struct vmcs12 *vmcs12); | ||
| 2816 | |||
| 2817 | static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) | ||
| 2818 | { | ||
| 2819 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 2820 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 2821 | struct page *page; | ||
| 2822 | u64 hpa; | ||
| 2823 | |||
| 2824 | if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { | ||
| 2825 | /* | ||
| 2826 | * Translate L1 physical address to host physical | ||
| 2827 | * address for vmcs02. Keep the page pinned, so this | ||
| 2828 | * physical address remains valid. We keep a reference | ||
| 2829 | * to it so we can release it later. | ||
| 2830 | */ | ||
| 2831 | if (vmx->nested.apic_access_page) { /* shouldn't happen */ | ||
| 2832 | kvm_release_page_dirty(vmx->nested.apic_access_page); | ||
| 2833 | vmx->nested.apic_access_page = NULL; | ||
| 2834 | } | ||
| 2835 | page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); | ||
| 2836 | /* | ||
| 2837 | * If translation failed, no matter: This feature asks | ||
| 2838 | * to exit when accessing the given address, and if it | ||
| 2839 | * can never be accessed, this feature won't do | ||
| 2840 | * anything anyway. | ||
| 2841 | */ | ||
| 2842 | if (!is_error_page(page)) { | ||
| 2843 | vmx->nested.apic_access_page = page; | ||
| 2844 | hpa = page_to_phys(vmx->nested.apic_access_page); | ||
| 2845 | vmcs_write64(APIC_ACCESS_ADDR, hpa); | ||
| 2846 | } else { | ||
| 2847 | vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, | ||
| 2848 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); | ||
| 2849 | } | ||
| 2850 | } | ||
| 2851 | |||
| 2852 | if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { | ||
| 2853 | if (vmx->nested.virtual_apic_page) { /* shouldn't happen */ | ||
| 2854 | kvm_release_page_dirty(vmx->nested.virtual_apic_page); | ||
| 2855 | vmx->nested.virtual_apic_page = NULL; | ||
| 2856 | } | ||
| 2857 | page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr); | ||
| 2858 | |||
| 2859 | /* | ||
| 2860 | * If translation failed, VM entry will fail because | ||
| 2861 | * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull. | ||
| 2862 | * Failing the vm entry is _not_ what the processor | ||
| 2863 | * does but it's basically the only possibility we | ||
| 2864 | * have. We could still enter the guest if CR8 load | ||
| 2865 | * exits are enabled, CR8 store exits are enabled, and | ||
| 2866 | * virtualize APIC access is disabled; in this case | ||
| 2867 | * the processor would never use the TPR shadow and we | ||
| 2868 | * could simply clear the bit from the execution | ||
| 2869 | * control. But such a configuration is useless, so | ||
| 2870 | * let's keep the code simple. | ||
| 2871 | */ | ||
| 2872 | if (!is_error_page(page)) { | ||
| 2873 | vmx->nested.virtual_apic_page = page; | ||
| 2874 | hpa = page_to_phys(vmx->nested.virtual_apic_page); | ||
| 2875 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa); | ||
| 2876 | } | ||
| 2877 | } | ||
| 2878 | |||
| 2879 | if (nested_cpu_has_posted_intr(vmcs12)) { | ||
| 2880 | if (vmx->nested.pi_desc_page) { /* shouldn't happen */ | ||
| 2881 | kunmap(vmx->nested.pi_desc_page); | ||
| 2882 | kvm_release_page_dirty(vmx->nested.pi_desc_page); | ||
| 2883 | vmx->nested.pi_desc_page = NULL; | ||
| 2884 | vmx->nested.pi_desc = NULL; | ||
| 2885 | vmcs_write64(POSTED_INTR_DESC_ADDR, -1ull); | ||
| 2886 | } | ||
| 2887 | page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr); | ||
| 2888 | if (is_error_page(page)) | ||
| 2889 | return; | ||
| 2890 | vmx->nested.pi_desc_page = page; | ||
| 2891 | vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page); | ||
| 2892 | vmx->nested.pi_desc = | ||
| 2893 | (struct pi_desc *)((void *)vmx->nested.pi_desc + | ||
| 2894 | (unsigned long)(vmcs12->posted_intr_desc_addr & | ||
| 2895 | (PAGE_SIZE - 1))); | ||
| 2896 | vmcs_write64(POSTED_INTR_DESC_ADDR, | ||
| 2897 | page_to_phys(vmx->nested.pi_desc_page) + | ||
| 2898 | (unsigned long)(vmcs12->posted_intr_desc_addr & | ||
| 2899 | (PAGE_SIZE - 1))); | ||
| 2900 | } | ||
| 2901 | if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) | ||
| 2902 | vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, | ||
| 2903 | CPU_BASED_USE_MSR_BITMAPS); | ||
| 2904 | else | ||
| 2905 | vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, | ||
| 2906 | CPU_BASED_USE_MSR_BITMAPS); | ||
| 2907 | } | ||
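For the posted-interrupt descriptor above, the host physical address programmed into POSTED_INTR_DESC_ADDR is the pinned page's physical address plus the sub-page offset of the guest physical address. A minimal userspace-style sketch of that offset arithmetic follows; the addresses are invented and a 4 KiB page size is assumed.

/* Standalone sketch (not kernel code): splitting a guest-physical address
 * into page frame plus in-page offset, as done above for
 * posted_intr_desc_addr. */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_PAGE_SIZE 4096ULL

int main(void)
{
	uint64_t gpa      = 0x12345a40ULL;   /* hypothetical posted_intr_desc_addr */
	uint64_t page_hpa = 0x9abcd000ULL;   /* hypothetical page_to_phys() result */

	uint64_t offset   = gpa & (SKETCH_PAGE_SIZE - 1);
	uint64_t desc_hpa = page_hpa + offset; /* value written to POSTED_INTR_DESC_ADDR */

	printf("offset in page: 0x%llx, descriptor HPA: 0x%llx\n",
	       (unsigned long long)offset, (unsigned long long)desc_hpa);
	return 0;
}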
| 2908 | |||
| 2909 | /* | ||
| 2910 | * Intel's VMX Instruction Reference specifies a common set of prerequisites | ||
| 2911 | * for running VMX instructions (except VMXON, whose prerequisites are | ||
| 2912 | * slightly different). It also specifies what exception to inject otherwise. | ||
| 2913 | * Note that many of these exceptions have priority over VM exits, so they | ||
| 2914 | * don't have to be checked again here. | ||
| 2915 | */ | ||
| 2916 | static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) | ||
| 2917 | { | ||
| 2918 | if (!to_vmx(vcpu)->nested.vmxon) { | ||
| 2919 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
| 2920 | return 0; | ||
| 2921 | } | ||
| 2922 | |||
| 2923 | if (vmx_get_cpl(vcpu)) { | ||
| 2924 | kvm_inject_gp(vcpu, 0); | ||
| 2925 | return 0; | ||
| 2926 | } | ||
| 2927 | |||
| 2928 | return 1; | ||
| 2929 | } | ||
| 2930 | |||
| 2931 | static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) | ||
| 2932 | { | ||
| 2933 | u8 rvi = vmx_get_rvi(); | ||
| 2934 | u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); | ||
| 2935 | |||
| 2936 | return ((rvi & 0xf0) > (vppr & 0xf0)); | ||
| 2937 | } | ||
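vmx_has_apicv_interrupt() compares only the upper nibble of RVI and the PPR, i.e. the 4-bit interrupt priority class. A tiny sketch with made-up vector and PPR values shows why the 0xf0 masking matters: a vector in the same class as the current PPR does not count as pending.

/* Standalone sketch (not kernel code): the priority-class comparison used by
 * vmx_has_apicv_interrupt(). Only bits 7:4 of RVI and PPR are compared. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool sketch_apicv_interrupt(uint8_t rvi, uint8_t vppr)
{
	return (rvi & 0xf0) > (vppr & 0xf0);
}

int main(void)
{
	/* Vector 0x41 vs PPR 0x48: same class 4, so not deliverable. */
	printf("%d\n", sketch_apicv_interrupt(0x41, 0x48));
	/* Vector 0x51 vs PPR 0x48: class 5 > class 4, deliverable. */
	printf("%d\n", sketch_apicv_interrupt(0x51, 0x48));
	return 0;
}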
| 2938 | |||
| 2939 | static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, | ||
| 2940 | struct vmcs12 *vmcs12); | ||
| 2941 | |||
| 2942 | /* | ||
| 2943 | * If from_vmentry is false, this is being called from state restore (either RSM | ||
| 2944 | * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. | ||
| 2945 | * | ||
| 2946 | * Returns: | ||
| 2947 | * 0 - success, i.e. proceed with actual VMEnter | ||
| 2948 | * 1 - consistency check VMExit | ||
| 2949 | * -1 - consistency check VMFail | ||
| 2950 | */ | ||
| 2951 | int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) | ||
| 2952 | { | ||
| 2953 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 2954 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 2955 | bool evaluate_pending_interrupts; | ||
| 2956 | u32 exit_reason = EXIT_REASON_INVALID_STATE; | ||
| 2957 | u32 exit_qual; | ||
| 2958 | |||
| 2959 | evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & | ||
| 2960 | (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING); | ||
| 2961 | if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) | ||
| 2962 | evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); | ||
| 2963 | |||
| 2964 | if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) | ||
| 2965 | vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); | ||
| 2966 | if (kvm_mpx_supported() && | ||
| 2967 | !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) | ||
| 2968 | vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); | ||
| 2969 | |||
| 2970 | vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); | ||
| 2971 | |||
| 2972 | prepare_vmcs02_early(vmx, vmcs12); | ||
| 2973 | |||
| 2974 | if (from_vmentry) { | ||
| 2975 | nested_get_vmcs12_pages(vcpu); | ||
| 2976 | |||
| 2977 | if (nested_vmx_check_vmentry_hw(vcpu)) { | ||
| 2978 | vmx_switch_vmcs(vcpu, &vmx->vmcs01); | ||
| 2979 | return -1; | ||
| 2980 | } | ||
| 2981 | |||
| 2982 | if (nested_vmx_check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) | ||
| 2983 | goto vmentry_fail_vmexit; | ||
| 2984 | } | ||
| 2985 | |||
| 2986 | enter_guest_mode(vcpu); | ||
| 2987 | if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) | ||
| 2988 | vcpu->arch.tsc_offset += vmcs12->tsc_offset; | ||
| 2989 | |||
| 2990 | if (prepare_vmcs02(vcpu, vmcs12, &exit_qual)) | ||
| 2991 | goto vmentry_fail_vmexit_guest_mode; | ||
| 2992 | |||
| 2993 | if (from_vmentry) { | ||
| 2994 | exit_reason = EXIT_REASON_MSR_LOAD_FAIL; | ||
| 2995 | exit_qual = nested_vmx_load_msr(vcpu, | ||
| 2996 | vmcs12->vm_entry_msr_load_addr, | ||
| 2997 | vmcs12->vm_entry_msr_load_count); | ||
| 2998 | if (exit_qual) | ||
| 2999 | goto vmentry_fail_vmexit_guest_mode; | ||
| 3000 | } else { | ||
| 3001 | /* | ||
| 3002 | * The MMU is not initialized to point at the right entities yet and | ||
| 3003 | * "get pages" would need to read data from the guest (i.e. we will | ||
| 3004 | * need to perform gpa to hpa translation). Request a call | ||
| 3005 | * to nested_get_vmcs12_pages before the next VM-entry. The MSRs | ||
| 3006 | * have already been set at vmentry time and should not be reset. | ||
| 3007 | */ | ||
| 3008 | kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); | ||
| 3009 | } | ||
| 3010 | |||
| 3011 | /* | ||
| 3012 | * If L1 had a pending IRQ/NMI until it executed | ||
| 3013 | * VMLAUNCH/VMRESUME which wasn't delivered because it was | ||
| 3014 | * disallowed (e.g. interrupts disabled), L0 needs to | ||
| 3015 | * evaluate if this pending event should cause an exit from L2 | ||
| 3016 | * to L1 or be delivered directly to L2 (e.g. in case L1 doesn't | ||
| 3017 | * intercept EXTERNAL_INTERRUPT). | ||
| 3018 | * | ||
| 3019 | * Usually this would be handled by the processor noticing an | ||
| 3020 | * IRQ/NMI window request, or checking RVI during evaluation of | ||
| 3021 | * pending virtual interrupts. However, this setting was done | ||
| 3022 | * on VMCS01 and now VMCS02 is active instead. Thus, we force L0 | ||
| 3023 | * to perform pending event evaluation by requesting a KVM_REQ_EVENT. | ||
| 3024 | */ | ||
| 3025 | if (unlikely(evaluate_pending_interrupts)) | ||
| 3026 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
| 3027 | |||
| 3028 | /* | ||
| 3029 | * Note no nested_vmx_succeed or nested_vmx_fail here. At this point | ||
| 3030 | * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet | ||
| 3031 | * returned as far as L1 is concerned. It will only return (and set | ||
| 3032 | * the success flag) when L2 exits (see nested_vmx_vmexit()). | ||
| 3033 | */ | ||
| 3034 | return 0; | ||
| 3035 | |||
| 3036 | /* | ||
| 3037 | * A failed consistency check that leads to a VMExit during L1's | ||
| 3038 | * VMEnter to L2 is a variation of a normal VMexit, as explained in | ||
| 3039 | * 26.7 "VM-entry failures during or after loading guest state". | ||
| 3040 | */ | ||
| 3041 | vmentry_fail_vmexit_guest_mode: | ||
| 3042 | if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) | ||
| 3043 | vcpu->arch.tsc_offset -= vmcs12->tsc_offset; | ||
| 3044 | leave_guest_mode(vcpu); | ||
| 3045 | |||
| 3046 | vmentry_fail_vmexit: | ||
| 3047 | vmx_switch_vmcs(vcpu, &vmx->vmcs01); | ||
| 3048 | |||
| 3049 | if (!from_vmentry) | ||
| 3050 | return 1; | ||
| 3051 | |||
| 3052 | load_vmcs12_host_state(vcpu, vmcs12); | ||
| 3053 | vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; | ||
| 3054 | vmcs12->exit_qualification = exit_qual; | ||
| 3055 | if (enable_shadow_vmcs || vmx->nested.hv_evmcs) | ||
| 3056 | vmx->nested.need_vmcs12_sync = true; | ||
| 3057 | return 1; | ||
| 3058 | } | ||
| 3059 | |||
| 3060 | /* | ||
| 3061 | * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 | ||
| 3062 | * for running an L2 nested guest. | ||
| 3063 | */ | ||
| 3064 | static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) | ||
| 3065 | { | ||
| 3066 | struct vmcs12 *vmcs12; | ||
| 3067 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 3068 | u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); | ||
| 3069 | int ret; | ||
| 3070 | |||
| 3071 | if (!nested_vmx_check_permission(vcpu)) | ||
| 3072 | return 1; | ||
| 3073 | |||
| 3074 | if (!nested_vmx_handle_enlightened_vmptrld(vcpu, true)) | ||
| 3075 | return 1; | ||
| 3076 | |||
| 3077 | if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull) | ||
| 3078 | return nested_vmx_failInvalid(vcpu); | ||
| 3079 | |||
| 3080 | vmcs12 = get_vmcs12(vcpu); | ||
| 3081 | |||
| 3082 | /* | ||
| 3083 | * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact | ||
| 3084 | * that there *is* a valid VMCS pointer, RFLAGS.CF is set | ||
| 3085 | * rather than RFLAGS.ZF, and no error number is stored to the | ||
| 3086 | * VM-instruction error field. | ||
| 3087 | */ | ||
| 3088 | if (vmcs12->hdr.shadow_vmcs) | ||
| 3089 | return nested_vmx_failInvalid(vcpu); | ||
| 3090 | |||
| 3091 | if (vmx->nested.hv_evmcs) { | ||
| 3092 | copy_enlightened_to_vmcs12(vmx); | ||
| 3093 | /* Enlightened VMCS doesn't have launch state */ | ||
| 3094 | vmcs12->launch_state = !launch; | ||
| 3095 | } else if (enable_shadow_vmcs) { | ||
| 3096 | copy_shadow_to_vmcs12(vmx); | ||
| 3097 | } | ||
| 3098 | |||
| 3099 | /* | ||
| 3100 | * The nested entry process starts with enforcing various prerequisites | ||
| 3101 | * on vmcs12 as required by the Intel SDM, and acting appropriately when | ||
| 3102 | * they fail: As the SDM explains, some conditions should cause the | ||
| 3103 | * instruction to fail, while others will cause the instruction to seem | ||
| 3104 | * to succeed, but return an EXIT_REASON_INVALID_STATE. | ||
| 3105 | * To speed up the normal (success) code path, we should avoid checking | ||
| 3106 | * for misconfigurations which will anyway be caught by the processor | ||
| 3107 | * when using the merged vmcs02. | ||
| 3108 | */ | ||
| 3109 | if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) | ||
| 3110 | return nested_vmx_failValid(vcpu, | ||
| 3111 | VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); | ||
| 3112 | |||
| 3113 | if (vmcs12->launch_state == launch) | ||
| 3114 | return nested_vmx_failValid(vcpu, | ||
| 3115 | launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS | ||
| 3116 | : VMXERR_VMRESUME_NONLAUNCHED_VMCS); | ||
| 3117 | |||
| 3118 | ret = nested_vmx_check_vmentry_prereqs(vcpu, vmcs12); | ||
| 3119 | if (ret) | ||
| 3120 | return nested_vmx_failValid(vcpu, ret); | ||
| 3121 | |||
| 3122 | /* | ||
| 3123 | * We're finally done with prerequisite checking, and can start with | ||
| 3124 | * the nested entry. | ||
| 3125 | */ | ||
| 3126 | vmx->nested.nested_run_pending = 1; | ||
| 3127 | ret = nested_vmx_enter_non_root_mode(vcpu, true); | ||
| 3128 | vmx->nested.nested_run_pending = !ret; | ||
| 3129 | if (ret > 0) | ||
| 3130 | return 1; | ||
| 3131 | else if (ret) | ||
| 3132 | return nested_vmx_failValid(vcpu, | ||
| 3133 | VMXERR_ENTRY_INVALID_CONTROL_FIELD); | ||
| 3134 | |||
| 3135 | /* Hide L1D cache contents from the nested guest. */ | ||
| 3136 | vmx->vcpu.arch.l1tf_flush_l1d = true; | ||
| 3137 | |||
| 3138 | /* | ||
| 3139 | * Must happen outside of nested_vmx_enter_non_root_mode() as it will | ||
| 3140 | * also be used as part of restoring nVMX state for | ||
| 3141 | * snapshot restore (migration). | ||
| 3142 | * | ||
| 3143 | * In this flow, it is assumed that vmcs12 cache was | ||
| 3144 | * transferred as part of captured nVMX state and should | ||
| 3145 | * therefore not be read from guest memory (which may not | ||
| 3146 | * exist on destination host yet). | ||
| 3147 | */ | ||
| 3148 | nested_cache_shadow_vmcs12(vcpu, vmcs12); | ||
| 3149 | |||
| 3150 | /* | ||
| 3151 | * If we're entering a halted L2 vcpu and the L2 vcpu won't be | ||
| 3152 | * awakened by event injection or by an NMI-window VM-exit or | ||
| 3153 | * by an interrupt-window VM-exit, halt the vcpu. | ||
| 3154 | */ | ||
| 3155 | if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) && | ||
| 3156 | !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && | ||
| 3157 | !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) && | ||
| 3158 | !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) && | ||
| 3159 | (vmcs12->guest_rflags & X86_EFLAGS_IF))) { | ||
| 3160 | vmx->nested.nested_run_pending = 0; | ||
| 3161 | return kvm_vcpu_halt(vcpu); | ||
| 3162 | } | ||
| 3163 | return 1; | ||
| 3164 | } | ||
| 3165 | |||
| 3166 | /* | ||
| 3167 | * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date | ||
| 3168 | * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). | ||
| 3169 | * This function returns the new value we should put in vmcs12.guest_cr0. | ||
| 3170 | * It's not enough to just return the vmcs02 GUEST_CR0. Rather, | ||
| 3171 | * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now | ||
| 3172 | * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 | ||
| 3173 | * didn't trap the bit, because if L1 did, so would L0). | ||
| 3174 | * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have | ||
| 3175 | * been modified by L2, and L1 knows it. So just leave the old value of | ||
| 3176 | * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 | ||
| 3177 | * isn't relevant, because if L0 traps this bit it can set it to anything. | ||
| 3178 | * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have | ||
| 3179 | * changed these bits, and therefore they need to be updated, but L0 | ||
| 3180 | * didn't necessarily allow them to be changed in GUEST_CR0 - and rather | ||
| 3181 | * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. | ||
| 3182 | */ | ||
| 3183 | static inline unsigned long | ||
| 3184 | vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | ||
| 3185 | { | ||
| 3186 | return | ||
| 3187 | /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | | ||
| 3188 | /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | | ||
| 3189 | /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | | ||
| 3190 | vcpu->arch.cr0_guest_owned_bits)); | ||
| 3191 | } | ||
| 3192 | |||
| 3193 | static inline unsigned long | ||
| 3194 | vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | ||
| 3195 | { | ||
| 3196 | return | ||
| 3197 | /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | | ||
| 3198 | /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | | ||
| 3199 | /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | | ||
| 3200 | vcpu->arch.cr4_guest_owned_bits)); | ||
| 3201 | } | ||
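The comment above vmcs12_guest_cr0() describes a three-way bit merge: guest-owned bits come from vmcs02 GUEST_CR0, bits trapped by L1 keep their vmcs12 value, and bits trapped only by L0 come from the read shadow. A standalone sketch of that merge follows; every value and mask below is invented for illustration, only the formula mirrors the code.

/* Standalone sketch (not kernel code): the three-way CR0 merge performed by
 * vmcs12_guest_cr0(). All values are hypothetical. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t guest_cr0        = 0x80050033ULL; /* vmcs02 GUEST_CR0 */
	uint64_t cr0_read_shadow  = 0x80050013ULL; /* vmcs02 CR0_READ_SHADOW */
	uint64_t vmcs12_guest_cr0 = 0x80050031ULL; /* last value L1 saw */

	uint64_t cr0_guest_owned  = 0x00000004ULL; /* bits neither L0 nor L1 trap (hypothetical) */
	uint64_t cr0_l1_mask      = 0x00000020ULL; /* vmcs12->cr0_guest_host_mask (hypothetical) */

	uint64_t merged =
		/*1*/ (guest_cr0 & cr0_guest_owned) |
		/*2*/ (vmcs12_guest_cr0 & cr0_l1_mask) |
		/*3*/ (cr0_read_shadow & ~(cr0_l1_mask | cr0_guest_owned));

	printf("merged guest_cr0 = 0x%llx\n", (unsigned long long)merged);
	return 0;
}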
| 3202 | |||
| 3203 | static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, | ||
| 3204 | struct vmcs12 *vmcs12) | ||
| 3205 | { | ||
| 3206 | u32 idt_vectoring; | ||
| 3207 | unsigned int nr; | ||
| 3208 | |||
| 3209 | if (vcpu->arch.exception.injected) { | ||
| 3210 | nr = vcpu->arch.exception.nr; | ||
| 3211 | idt_vectoring = nr | VECTORING_INFO_VALID_MASK; | ||
| 3212 | |||
| 3213 | if (kvm_exception_is_soft(nr)) { | ||
| 3214 | vmcs12->vm_exit_instruction_len = | ||
| 3215 | vcpu->arch.event_exit_inst_len; | ||
| 3216 | idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; | ||
| 3217 | } else | ||
| 3218 | idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; | ||
| 3219 | |||
| 3220 | if (vcpu->arch.exception.has_error_code) { | ||
| 3221 | idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; | ||
| 3222 | vmcs12->idt_vectoring_error_code = | ||
| 3223 | vcpu->arch.exception.error_code; | ||
| 3224 | } | ||
| 3225 | |||
| 3226 | vmcs12->idt_vectoring_info_field = idt_vectoring; | ||
| 3227 | } else if (vcpu->arch.nmi_injected) { | ||
| 3228 | vmcs12->idt_vectoring_info_field = | ||
| 3229 | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; | ||
| 3230 | } else if (vcpu->arch.interrupt.injected) { | ||
| 3231 | nr = vcpu->arch.interrupt.nr; | ||
| 3232 | idt_vectoring = nr | VECTORING_INFO_VALID_MASK; | ||
| 3233 | |||
| 3234 | if (vcpu->arch.interrupt.soft) { | ||
| 3235 | idt_vectoring |= INTR_TYPE_SOFT_INTR; | ||
| 3236 | vmcs12->vm_entry_instruction_len = | ||
| 3237 | vcpu->arch.event_exit_inst_len; | ||
| 3238 | } else | ||
| 3239 | idt_vectoring |= INTR_TYPE_EXT_INTR; | ||
| 3240 | |||
| 3241 | vmcs12->idt_vectoring_info_field = idt_vectoring; | ||
| 3242 | } | ||
| 3243 | } | ||
| 3244 | |||
| 3245 | |||
| 3246 | static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) | ||
| 3247 | { | ||
| 3248 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 3249 | gfn_t gfn; | ||
| 3250 | |||
| 3251 | /* | ||
| 3252 | * Don't need to mark the APIC access page dirty; it is never | ||
| 3253 | * written to by the CPU during APIC virtualization. | ||
| 3254 | */ | ||
| 3255 | |||
| 3256 | if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { | ||
| 3257 | gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; | ||
| 3258 | kvm_vcpu_mark_page_dirty(vcpu, gfn); | ||
| 3259 | } | ||
| 3260 | |||
| 3261 | if (nested_cpu_has_posted_intr(vmcs12)) { | ||
| 3262 | gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; | ||
| 3263 | kvm_vcpu_mark_page_dirty(vcpu, gfn); | ||
| 3264 | } | ||
| 3265 | } | ||
| 3266 | |||
| 3267 | static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) | ||
| 3268 | { | ||
| 3269 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 3270 | int max_irr; | ||
| 3271 | void *vapic_page; | ||
| 3272 | u16 status; | ||
| 3273 | |||
| 3274 | if (!vmx->nested.pi_desc || !vmx->nested.pi_pending) | ||
| 3275 | return; | ||
| 3276 | |||
| 3277 | vmx->nested.pi_pending = false; | ||
| 3278 | if (!pi_test_and_clear_on(vmx->nested.pi_desc)) | ||
| 3279 | return; | ||
| 3280 | |||
| 3281 | max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); | ||
| 3282 | if (max_irr != 256) { | ||
| 3283 | vapic_page = kmap(vmx->nested.virtual_apic_page); | ||
| 3284 | __kvm_apic_update_irr(vmx->nested.pi_desc->pir, | ||
| 3285 | vapic_page, &max_irr); | ||
| 3286 | kunmap(vmx->nested.virtual_apic_page); | ||
| 3287 | |||
| 3288 | status = vmcs_read16(GUEST_INTR_STATUS); | ||
| 3289 | if ((u8)max_irr > ((u8)status & 0xff)) { | ||
| 3290 | status &= ~0xff; | ||
| 3291 | status |= (u8)max_irr; | ||
| 3292 | vmcs_write16(GUEST_INTR_STATUS, status); | ||
| 3293 | } | ||
| 3294 | } | ||
| 3295 | |||
| 3296 | nested_mark_vmcs12_pages_dirty(vcpu); | ||
| 3297 | } | ||
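The tail of vmx_complete_nested_posted_interrupt() raises RVI, which lives in the low byte of the 16-bit GUEST_INTR_STATUS field, without touching SVI in the high byte, and only ever raises it. A small sketch with invented values shows the update.

/* Standalone sketch (not kernel code): updating the RVI byte of
 * GUEST_INTR_STATUS as done above. Values are invented. */
#include <stdint.h>
#include <stdio.h>

static uint16_t sketch_update_rvi(uint16_t status, uint8_t max_irr)
{
	/* RVI is bits 7:0; SVI in bits 15:8 is left untouched.
	 * RVI is only raised here, never lowered. */
	if (max_irr > (uint8_t)(status & 0xff)) {
		status &= ~0xff;
		status |= max_irr;
	}
	return status;
}

int main(void)
{
	uint16_t status = 0x3020;   /* SVI = 0x30, RVI = 0x20 */
	printf("0x%04x\n", sketch_update_rvi(status, 0x51)); /* -> 0x3051 */
	printf("0x%04x\n", sketch_update_rvi(status, 0x10)); /* unchanged: 0x3020 */
	return 0;
}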
| 3298 | |||
| 3299 | static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, | ||
| 3300 | unsigned long exit_qual) | ||
| 3301 | { | ||
| 3302 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 3303 | unsigned int nr = vcpu->arch.exception.nr; | ||
| 3304 | u32 intr_info = nr | INTR_INFO_VALID_MASK; | ||
| 3305 | |||
| 3306 | if (vcpu->arch.exception.has_error_code) { | ||
| 3307 | vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; | ||
| 3308 | intr_info |= INTR_INFO_DELIVER_CODE_MASK; | ||
| 3309 | } | ||
| 3310 | |||
| 3311 | if (kvm_exception_is_soft(nr)) | ||
| 3312 | intr_info |= INTR_TYPE_SOFT_EXCEPTION; | ||
| 3313 | else | ||
| 3314 | intr_info |= INTR_TYPE_HARD_EXCEPTION; | ||
| 3315 | |||
| 3316 | if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && | ||
| 3317 | vmx_get_nmi_mask(vcpu)) | ||
| 3318 | intr_info |= INTR_INFO_UNBLOCK_NMI; | ||
| 3319 | |||
| 3320 | nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); | ||
| 3321 | } | ||
| 3322 | |||
| 3323 | static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) | ||
| 3324 | { | ||
| 3325 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 3326 | unsigned long exit_qual; | ||
| 3327 | bool block_nested_events = | ||
| 3328 | vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); | ||
| 3329 | |||
| 3330 | if (vcpu->arch.exception.pending && | ||
| 3331 | nested_vmx_check_exception(vcpu, &exit_qual)) { | ||
| 3332 | if (block_nested_events) | ||
| 3333 | return -EBUSY; | ||
| 3334 | nested_vmx_inject_exception_vmexit(vcpu, exit_qual); | ||
| 3335 | return 0; | ||
| 3336 | } | ||
| 3337 | |||
| 3338 | if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && | ||
| 3339 | vmx->nested.preemption_timer_expired) { | ||
| 3340 | if (block_nested_events) | ||
| 3341 | return -EBUSY; | ||
| 3342 | nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); | ||
| 3343 | return 0; | ||
| 3344 | } | ||
| 3345 | |||
| 3346 | if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { | ||
| 3347 | if (block_nested_events) | ||
| 3348 | return -EBUSY; | ||
| 3349 | nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, | ||
| 3350 | NMI_VECTOR | INTR_TYPE_NMI_INTR | | ||
| 3351 | INTR_INFO_VALID_MASK, 0); | ||
| 3352 | /* | ||
| 3353 | * The NMI-triggered VM exit counts as injection: | ||
| 3354 | * clear this one and block further NMIs. | ||
| 3355 | */ | ||
| 3356 | vcpu->arch.nmi_pending = 0; | ||
| 3357 | vmx_set_nmi_mask(vcpu, true); | ||
| 3358 | return 0; | ||
| 3359 | } | ||
| 3360 | |||
| 3361 | if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && | ||
| 3362 | nested_exit_on_intr(vcpu)) { | ||
| 3363 | if (block_nested_events) | ||
| 3364 | return -EBUSY; | ||
| 3365 | nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); | ||
| 3366 | return 0; | ||
| 3367 | } | ||
| 3368 | |||
| 3369 | vmx_complete_nested_posted_interrupt(vcpu); | ||
| 3370 | return 0; | ||
| 3371 | } | ||
| 3372 | |||
| 3373 | static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) | ||
| 3374 | { | ||
| 3375 | ktime_t remaining = | ||
| 3376 | hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); | ||
| 3377 | u64 value; | ||
| 3378 | |||
| 3379 | if (ktime_to_ns(remaining) <= 0) | ||
| 3380 | return 0; | ||
| 3381 | |||
| 3382 | value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; | ||
| 3383 | do_div(value, 1000000); | ||
| 3384 | return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; | ||
| 3385 | } | ||
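vmx_get_preemption_timer_value() converts the hrtimer's remaining nanoseconds into guest TSC ticks and then scales by the emulated preemption-timer rate shift. The following standalone sketch walks through that arithmetic; the TSC frequency is invented and the rate shift of 5 is the value typically advertised in VMX_MISC, used here only as an example.

/* Standalone sketch (not kernel code): the arithmetic in
 * vmx_get_preemption_timer_value(). Example values only. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t remaining_ns   = 250000;   /* hrtimer_get_remaining(), in ns */
	uint64_t tsc_khz        = 2600000;  /* hypothetical virtual_tsc_khz (2.6 GHz) */
	unsigned int rate_shift = 5;        /* assumed VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE */

	/* ns * kHz / 1e6 = TSC ticks; the timer counts in units of 2^rate ticks. */
	uint64_t ticks = remaining_ns * tsc_khz / 1000000;
	uint64_t timer_value = ticks >> rate_shift;

	printf("ticks=%llu preemption_timer_value=%llu\n",
	       (unsigned long long)ticks, (unsigned long long)timer_value);
	return 0;
}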
| 3386 | |||
| 3387 | /* | ||
| 3388 | * Update the guest state fields of vmcs12 to reflect changes that | ||
| 3389 | * occurred while L2 was running. (The "IA-32e mode guest" bit of the | ||
| 3390 | * VM-entry controls is also updated, since this is really a guest | ||
| 3391 | * state bit.) | ||
| 3392 | */ | ||
| 3393 | static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | ||
| 3394 | { | ||
| 3395 | vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); | ||
| 3396 | vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); | ||
| 3397 | |||
| 3398 | vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); | ||
| 3399 | vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP); | ||
| 3400 | vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); | ||
| 3401 | |||
| 3402 | vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); | ||
| 3403 | vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); | ||
| 3404 | vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); | ||
| 3405 | vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); | ||
| 3406 | vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); | ||
| 3407 | vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); | ||
| 3408 | vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); | ||
| 3409 | vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); | ||
| 3410 | vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); | ||
| 3411 | vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); | ||
| 3412 | vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); | ||
| 3413 | vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); | ||
| 3414 | vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); | ||
| 3415 | vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); | ||
| 3416 | vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); | ||
| 3417 | vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); | ||
| 3418 | vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); | ||
| 3419 | vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); | ||
| 3420 | vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); | ||
| 3421 | vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); | ||
| 3422 | vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); | ||
| 3423 | vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); | ||
| 3424 | vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); | ||
| 3425 | vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); | ||
| 3426 | vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); | ||
| 3427 | vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); | ||
| 3428 | vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); | ||
| 3429 | vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); | ||
| 3430 | vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); | ||
| 3431 | vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); | ||
| 3432 | vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); | ||
| 3433 | vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); | ||
| 3434 | vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); | ||
| 3435 | vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); | ||
| 3436 | vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); | ||
| 3437 | vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); | ||
| 3438 | |||
| 3439 | vmcs12->guest_interruptibility_info = | ||
| 3440 | vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); | ||
| 3441 | vmcs12->guest_pending_dbg_exceptions = | ||
| 3442 | vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); | ||
| 3443 | if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) | ||
| 3444 | vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; | ||
| 3445 | else | ||
| 3446 | vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; | ||
| 3447 | |||
| 3448 | if (nested_cpu_has_preemption_timer(vmcs12)) { | ||
| 3449 | if (vmcs12->vm_exit_controls & | ||
| 3450 | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) | ||
| 3451 | vmcs12->vmx_preemption_timer_value = | ||
| 3452 | vmx_get_preemption_timer_value(vcpu); | ||
| 3453 | hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); | ||
| 3454 | } | ||
| 3455 | |||
| 3456 | /* | ||
| 3457 | * In some cases (usually, nested EPT), L2 is allowed to change its | ||
| 3458 | * own CR3 without exiting. If it has changed it, we must keep it. | ||
| 3459 | * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined | ||
| 3460 | * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. | ||
| 3461 | * | ||
| 3462 | * Additionally, restore L2's PDPTR to vmcs12. | ||
| 3463 | */ | ||
| 3464 | if (enable_ept) { | ||
| 3465 | vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); | ||
| 3466 | vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); | ||
| 3467 | vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); | ||
| 3468 | vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); | ||
| 3469 | vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); | ||
| 3470 | } | ||
| 3471 | |||
| 3472 | vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); | ||
| 3473 | |||
| 3474 | if (nested_cpu_has_vid(vmcs12)) | ||
| 3475 | vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); | ||
| 3476 | |||
| 3477 | vmcs12->vm_entry_controls = | ||
| 3478 | (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | | ||
| 3479 | (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); | ||
| 3480 | |||
| 3481 | if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) { | ||
| 3482 | kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); | ||
| 3483 | vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); | ||
| 3484 | } | ||
| 3485 | |||
| 3486 | /* TODO: These cannot have changed unless we have MSR bitmaps and | ||
| 3487 | * the relevant bit asks not to trap the change */ | ||
| 3488 | if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) | ||
| 3489 | vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); | ||
| 3490 | if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) | ||
| 3491 | vmcs12->guest_ia32_efer = vcpu->arch.efer; | ||
| 3492 | vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); | ||
| 3493 | vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); | ||
| 3494 | vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); | ||
| 3495 | if (kvm_mpx_supported()) | ||
| 3496 | vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); | ||
| 3497 | } | ||
| 3498 | |||
| 3499 | /* | ||
| 3500 | * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits | ||
| 3501 | * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), | ||
| 3502 | * and this function updates it to reflect the changes to the guest state while | ||
| 3503 | * L2 was running (and perhaps made some exits which were handled directly by L0 | ||
| 3504 | * without going back to L1), and to reflect the exit reason. | ||
| 3505 | * Note that we do not have to copy here all VMCS fields, just those that | ||
| 3506 | * could have changed by the L2 guest or the exit - i.e., the guest-state and | ||
| 3507 | * exit-information fields only. Other fields are modified by L1 with VMWRITE, | ||
| 3508 | * which already writes to vmcs12 directly. | ||
| 3509 | */ | ||
| 3510 | static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, | ||
| 3511 | u32 exit_reason, u32 exit_intr_info, | ||
| 3512 | unsigned long exit_qualification) | ||
| 3513 | { | ||
| 3514 | /* update guest state fields: */ | ||
| 3515 | sync_vmcs12(vcpu, vmcs12); | ||
| 3516 | |||
| 3517 | /* update exit information fields: */ | ||
| 3518 | |||
| 3519 | vmcs12->vm_exit_reason = exit_reason; | ||
| 3520 | vmcs12->exit_qualification = exit_qualification; | ||
| 3521 | vmcs12->vm_exit_intr_info = exit_intr_info; | ||
| 3522 | |||
| 3523 | vmcs12->idt_vectoring_info_field = 0; | ||
| 3524 | vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | ||
| 3525 | vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
| 3526 | |||
| 3527 | if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { | ||
| 3528 | vmcs12->launch_state = 1; | ||
| 3529 | |||
| 3530 | /* vm_entry_intr_info_field is cleared on exit. Emulate this | ||
| 3531 | * instead of reading the real value. */ | ||
| 3532 | vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; | ||
| 3533 | |||
| 3534 | /* | ||
| 3535 | * Transfer the event that L0 or L1 may have wanted to inject into | ||
| 3536 | * L2 to IDT_VECTORING_INFO_FIELD. | ||
| 3537 | */ | ||
| 3538 | vmcs12_save_pending_event(vcpu, vmcs12); | ||
| 3539 | |||
| 3540 | /* | ||
| 3541 | * According to spec, there's no need to store the guest's | ||
| 3542 | * MSRs if the exit is due to a VM-entry failure that occurs | ||
| 3543 | * during or after loading the guest state. Since this exit | ||
| 3544 | * does not fall in that category, we need to save the MSRs. | ||
| 3545 | */ | ||
| 3546 | if (nested_vmx_store_msr(vcpu, | ||
| 3547 | vmcs12->vm_exit_msr_store_addr, | ||
| 3548 | vmcs12->vm_exit_msr_store_count)) | ||
| 3549 | nested_vmx_abort(vcpu, | ||
| 3550 | VMX_ABORT_SAVE_GUEST_MSR_FAIL); | ||
| 3551 | } | ||
| 3552 | |||
| 3553 | /* | ||
| 3554 | * Drop what we picked up for L2 via vmx_complete_interrupts. It is | ||
| 3555 | * preserved above and would only end up incorrectly in L1. | ||
| 3556 | */ | ||
| 3557 | vcpu->arch.nmi_injected = false; | ||
| 3558 | kvm_clear_exception_queue(vcpu); | ||
| 3559 | kvm_clear_interrupt_queue(vcpu); | ||
| 3560 | } | ||
| 3561 | |||
| 3562 | /* | ||
| 3563 | * A part of what we need to do when the nested L2 guest exits and we want to | ||
| 3564 | * run its L1 parent, is to reset L1's guest state to the host state specified | ||
| 3565 | * in vmcs12. | ||
| 3566 | * This function is to be called not only on normal nested exit, but also on | ||
| 3567 | * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry | ||
| 3568 | * Failures During or After Loading Guest State"). | ||
| 3569 | * This function should be called when the active VMCS is L1's (vmcs01). | ||
| 3570 | */ | ||
| 3571 | static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, | ||
| 3572 | struct vmcs12 *vmcs12) | ||
| 3573 | { | ||
| 3574 | struct kvm_segment seg; | ||
| 3575 | u32 entry_failure_code; | ||
| 3576 | |||
| 3577 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) | ||
| 3578 | vcpu->arch.efer = vmcs12->host_ia32_efer; | ||
| 3579 | else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) | ||
| 3580 | vcpu->arch.efer |= (EFER_LMA | EFER_LME); | ||
| 3581 | else | ||
| 3582 | vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); | ||
| 3583 | vmx_set_efer(vcpu, vcpu->arch.efer); | ||
| 3584 | |||
| 3585 | kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); | ||
| 3586 | kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); | ||
| 3587 | vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); | ||
| 3588 | vmx_set_interrupt_shadow(vcpu, 0); | ||
| 3589 | |||
| 3590 | /* | ||
| 3591 | * Note that calling vmx_set_cr0 is important, even if cr0 hasn't | ||
| 3592 | * actually changed, because vmx_set_cr0 refers to efer set above. | ||
| 3593 | * | ||
| 3594 | * CR0_GUEST_HOST_MASK is already set in the original vmcs01 | ||
| 3595 | * (KVM doesn't change it); | ||
| 3596 | */ | ||
| 3597 | vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; | ||
| 3598 | vmx_set_cr0(vcpu, vmcs12->host_cr0); | ||
| 3599 | |||
| 3600 | /* Same as above - no reason to call set_cr4_guest_host_mask(). */ | ||
| 3601 | vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); | ||
| 3602 | vmx_set_cr4(vcpu, vmcs12->host_cr4); | ||
| 3603 | |||
| 3604 | nested_ept_uninit_mmu_context(vcpu); | ||
| 3605 | |||
| 3606 | /* | ||
| 3607 | * Only PDPTE load can fail as the value of cr3 was checked on entry and | ||
| 3608 | * couldn't have changed. | ||
| 3609 | */ | ||
| 3610 | if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) | ||
| 3611 | nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); | ||
| 3612 | |||
| 3613 | if (!enable_ept) | ||
| 3614 | vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; | ||
| 3615 | |||
| 3616 | /* | ||
| 3617 | * If vmcs01 doesn't use VPID, CPU flushes TLB on every | ||
| 3618 | * VMEntry/VMExit. Thus, no need to flush TLB. | ||
| 3619 | * | ||
| 3620 | * If vmcs12 doesn't use VPID, L1 expects TLB to be | ||
| 3621 | * flushed on every VMEntry/VMExit. | ||
| 3622 | * | ||
| 3623 | * Otherwise, we can preserve TLB entries as long as we are | ||
| 3624 | * able to tag L1 TLB entries differently than L2 TLB entries. | ||
| 3625 | * | ||
| 3626 | * If vmcs12 uses EPT, we need to execute this flush on EPTP01 | ||
| 3627 | * and therefore we request the TLB flush to happen only after VMCS EPTP | ||
| 3628 | * has been set by KVM_REQ_LOAD_CR3. | ||
| 3629 | */ | ||
| 3630 | if (enable_vpid && | ||
| 3631 | (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) { | ||
| 3632 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); | ||
| 3633 | } | ||
| 3634 | |||
| 3635 | vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); | ||
| 3636 | vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); | ||
| 3637 | vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); | ||
| 3638 | vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); | ||
| 3639 | vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); | ||
| 3640 | vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); | ||
| 3641 | vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); | ||
| 3642 | |||
| 3643 | /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ | ||
| 3644 | if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) | ||
| 3645 | vmcs_write64(GUEST_BNDCFGS, 0); | ||
| 3646 | |||
| 3647 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { | ||
| 3648 | vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); | ||
| 3649 | vcpu->arch.pat = vmcs12->host_ia32_pat; | ||
| 3650 | } | ||
| 3651 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) | ||
| 3652 | vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, | ||
| 3653 | vmcs12->host_ia32_perf_global_ctrl); | ||
| 3654 | |||
| 3655 | /* Set L1 segment info according to Intel SDM | ||
| 3656 | 27.5.2 Loading Host Segment and Descriptor-Table Registers */ | ||
| 3657 | seg = (struct kvm_segment) { | ||
| 3658 | .base = 0, | ||
| 3659 | .limit = 0xFFFFFFFF, | ||
| 3660 | .selector = vmcs12->host_cs_selector, | ||
| 3661 | .type = 11, | ||
| 3662 | .present = 1, | ||
| 3663 | .s = 1, | ||
| 3664 | .g = 1 | ||
| 3665 | }; | ||
| 3666 | if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) | ||
| 3667 | seg.l = 1; | ||
| 3668 | else | ||
| 3669 | seg.db = 1; | ||
| 3670 | vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); | ||
| 3671 | seg = (struct kvm_segment) { | ||
| 3672 | .base = 0, | ||
| 3673 | .limit = 0xFFFFFFFF, | ||
| 3674 | .type = 3, | ||
| 3675 | .present = 1, | ||
| 3676 | .s = 1, | ||
| 3677 | .db = 1, | ||
| 3678 | .g = 1 | ||
| 3679 | }; | ||
| 3680 | seg.selector = vmcs12->host_ds_selector; | ||
| 3681 | vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); | ||
| 3682 | seg.selector = vmcs12->host_es_selector; | ||
| 3683 | vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); | ||
| 3684 | seg.selector = vmcs12->host_ss_selector; | ||
| 3685 | vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); | ||
| 3686 | seg.selector = vmcs12->host_fs_selector; | ||
| 3687 | seg.base = vmcs12->host_fs_base; | ||
| 3688 | vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); | ||
| 3689 | seg.selector = vmcs12->host_gs_selector; | ||
| 3690 | seg.base = vmcs12->host_gs_base; | ||
| 3691 | vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); | ||
| 3692 | seg = (struct kvm_segment) { | ||
| 3693 | .base = vmcs12->host_tr_base, | ||
| 3694 | .limit = 0x67, | ||
| 3695 | .selector = vmcs12->host_tr_selector, | ||
| 3696 | .type = 11, | ||
| 3697 | .present = 1 | ||
| 3698 | }; | ||
| 3699 | vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); | ||
| 3700 | |||
| 3701 | kvm_set_dr(vcpu, 7, 0x400); | ||
| 3702 | vmcs_write64(GUEST_IA32_DEBUGCTL, 0); | ||
| 3703 | |||
| 3704 | if (cpu_has_vmx_msr_bitmap()) | ||
| 3705 | vmx_update_msr_bitmap(vcpu); | ||
| 3706 | |||
| 3707 | if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, | ||
| 3708 | vmcs12->vm_exit_msr_load_count)) | ||
| 3709 | nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); | ||
| 3710 | } | ||
| 3711 | |||
| 3712 | static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) | ||
| 3713 | { | ||
| 3714 | struct shared_msr_entry *efer_msr; | ||
| 3715 | unsigned int i; | ||
| 3716 | |||
| 3717 | if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) | ||
| 3718 | return vmcs_read64(GUEST_IA32_EFER); | ||
| 3719 | |||
| 3720 | if (cpu_has_load_ia32_efer()) | ||
| 3721 | return host_efer; | ||
| 3722 | |||
| 3723 | for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { | ||
| 3724 | if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) | ||
| 3725 | return vmx->msr_autoload.guest.val[i].value; | ||
| 3726 | } | ||
| 3727 | |||
| 3728 | efer_msr = find_msr_entry(vmx, MSR_EFER); | ||
| 3729 | if (efer_msr) | ||
| 3730 | return efer_msr->data; | ||
| 3731 | |||
| 3732 | return host_efer; | ||
| 3733 | } | ||
| 3734 | |||
| 3735 | static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) | ||
| 3736 | { | ||
| 3737 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 3738 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 3739 | struct vmx_msr_entry g, h; | ||
| 3740 | struct msr_data msr; | ||
| 3741 | gpa_t gpa; | ||
| 3742 | u32 i, j; | ||
| 3743 | |||
| 3744 | vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); | ||
| 3745 | |||
| 3746 | if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { | ||
| 3747 | /* | ||
| 3748 | * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set | ||
| 3749 | * as vmcs01.GUEST_DR7 contains a userspace defined value | ||
| 3750 | * and vcpu->arch.dr7 is not squirreled away before the | ||
| 3751 | * nested VMENTER (not worth adding a variable in nested_vmx). | ||
| 3752 | */ | ||
| 3753 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) | ||
| 3754 | kvm_set_dr(vcpu, 7, DR7_FIXED_1); | ||
| 3755 | else | ||
| 3756 | WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); | ||
| 3757 | } | ||
| 3758 | |||
| 3759 | /* | ||
| 3760 | * Note that calling vmx_set_{efer,cr0,cr4} is important as they | ||
| 3761 | * handle a variety of side effects to KVM's software model. | ||
| 3762 | */ | ||
| 3763 | vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); | ||
| 3764 | |||
| 3765 | vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; | ||
| 3766 | vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); | ||
| 3767 | |||
| 3768 | vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); | ||
| 3769 | vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); | ||
| 3770 | |||
| 3771 | nested_ept_uninit_mmu_context(vcpu); | ||
| 3772 | vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); | ||
| 3773 | __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); | ||
| 3774 | |||
| 3775 | /* | ||
| 3776 | * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs | ||
| 3777 | * from vmcs01 (if necessary). The PDPTRs are not loaded on | ||
| 3778 | * VMFail; like everything else, we just need to ensure our | ||
| 3779 | * software model is up-to-date. | ||
| 3780 | */ | ||
| 3781 | ept_save_pdptrs(vcpu); | ||
| 3782 | |||
| 3783 | kvm_mmu_reset_context(vcpu); | ||
| 3784 | |||
| 3785 | if (cpu_has_vmx_msr_bitmap()) | ||
| 3786 | vmx_update_msr_bitmap(vcpu); | ||
| 3787 | |||
| 3788 | /* | ||
| 3789 | * This nasty bit of open coding is a compromise between blindly | ||
| 3790 | * loading L1's MSRs using the exit load lists (incorrect emulation | ||
| 3791 | * of VMFail), leaving the nested VM's MSRs in the software model | ||
| 3792 | * (incorrect behavior) and snapshotting the modified MSRs (too | ||
| 3793 | * expensive since the lists are not bounded by hardware). For each | ||
| 3794 | * MSR that was (prematurely) loaded from the nested VMEntry load | ||
| 3795 | * list, reload it from the exit load list if it exists and differs | ||
| 3796 | * from the guest value. The intent is to stuff host state as | ||
| 3797 | * silently as possible, not to fully process the exit load list. | ||
| 3798 | */ | ||
| 3799 | msr.host_initiated = false; | ||
| 3800 | for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { | ||
| 3801 | gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); | ||
| 3802 | if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { | ||
| 3803 | pr_debug_ratelimited( | ||
| 3804 | "%s read MSR index failed (%u, 0x%08llx)\n", | ||
| 3805 | __func__, i, gpa); | ||
| 3806 | goto vmabort; | ||
| 3807 | } | ||
| 3808 | |||
| 3809 | for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { | ||
| 3810 | gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); | ||
| 3811 | if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { | ||
| 3812 | pr_debug_ratelimited( | ||
| 3813 | "%s read MSR failed (%u, 0x%08llx)\n", | ||
| 3814 | __func__, j, gpa); | ||
| 3815 | goto vmabort; | ||
| 3816 | } | ||
| 3817 | if (h.index != g.index) | ||
| 3818 | continue; | ||
| 3819 | if (h.value == g.value) | ||
| 3820 | break; | ||
| 3821 | |||
| 3822 | if (nested_vmx_load_msr_check(vcpu, &h)) { | ||
| 3823 | pr_debug_ratelimited( | ||
| 3824 | "%s check failed (%u, 0x%x, 0x%x)\n", | ||
| 3825 | __func__, j, h.index, h.reserved); | ||
| 3826 | goto vmabort; | ||
| 3827 | } | ||
| 3828 | |||
| 3829 | msr.index = h.index; | ||
| 3830 | msr.data = h.value; | ||
| 3831 | if (kvm_set_msr(vcpu, &msr)) { | ||
| 3832 | pr_debug_ratelimited( | ||
| 3833 | "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", | ||
| 3834 | __func__, j, h.index, h.value); | ||
| 3835 | goto vmabort; | ||
| 3836 | } | ||
| 3837 | } | ||
| 3838 | } | ||
| 3839 | |||
| 3840 | return; | ||
| 3841 | |||
| 3842 | vmabort: | ||
| 3843 | nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); | ||
| 3844 | } | ||
| 3845 | |||
| 3846 | /* | ||
| 3847 | * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 | ||
| 3848 | * and modify vmcs12 to make it see what it would expect to see there if | ||
| 3849 | * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) | ||
| 3850 | */ | ||
| 3851 | void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, | ||
| 3852 | u32 exit_intr_info, unsigned long exit_qualification) | ||
| 3853 | { | ||
| 3854 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 3855 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 3856 | |||
| 3857 | /* trying to cancel vmlaunch/vmresume is a bug */ | ||
| 3858 | WARN_ON_ONCE(vmx->nested.nested_run_pending); | ||
| 3859 | |||
| 3860 | leave_guest_mode(vcpu); | ||
| 3861 | |||
| 3862 | if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) | ||
| 3863 | vcpu->arch.tsc_offset -= vmcs12->tsc_offset; | ||
| 3864 | |||
| 3865 | if (likely(!vmx->fail)) { | ||
| 3866 | if (exit_reason == -1) | ||
| 3867 | sync_vmcs12(vcpu, vmcs12); | ||
| 3868 | else | ||
| 3869 | prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, | ||
| 3870 | exit_qualification); | ||
| 3871 | |||
| 3872 | /* | ||
| 3873 | * Must happen outside of sync_vmcs12() as it will | ||
| 3874 | * also be used to capture vmcs12 cache as part of | ||
| 3875 | * capturing nVMX state for snapshot (migration). | ||
| 3876 | * | ||
| 3877 | * Otherwise, this flush will dirty guest memory at a | ||
| 3878 | * point where it is already assumed by user-space to be | ||
| 3879 | * immutable. | ||
| 3880 | */ | ||
| 3881 | nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); | ||
| 3882 | } else { | ||
| 3883 | /* | ||
| 3884 | * The only expected VM-instruction error is "VM entry with | ||
| 3885 | * invalid control field(s)." Anything else indicates a | ||
| 3886 | * problem with L0. And we should never get here with a | ||
| 3887 | * VMFail of any type if early consistency checks are enabled. | ||
| 3888 | */ | ||
| 3889 | WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != | ||
| 3890 | VMXERR_ENTRY_INVALID_CONTROL_FIELD); | ||
| 3891 | WARN_ON_ONCE(nested_early_check); | ||
| 3892 | } | ||
| 3893 | |||
| 3894 | vmx_switch_vmcs(vcpu, &vmx->vmcs01); | ||
| 3895 | |||
| 3896 | /* Update any VMCS fields that might have changed while L2 ran */ | ||
| 3897 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); | ||
| 3898 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); | ||
| 3899 | vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); | ||
| 3900 | |||
| 3901 | if (kvm_has_tsc_control) | ||
| 3902 | decache_tsc_multiplier(vmx); | ||
| 3903 | |||
| 3904 | if (vmx->nested.change_vmcs01_virtual_apic_mode) { | ||
| 3905 | vmx->nested.change_vmcs01_virtual_apic_mode = false; | ||
| 3906 | vmx_set_virtual_apic_mode(vcpu); | ||
| 3907 | } else if (!nested_cpu_has_ept(vmcs12) && | ||
| 3908 | nested_cpu_has2(vmcs12, | ||
| 3909 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { | ||
| 3910 | vmx_flush_tlb(vcpu, true); | ||
| 3911 | } | ||
| 3912 | |||
| 3913 | /* This is needed for same reason as it was needed in prepare_vmcs02 */ | ||
| 3914 | vmx->host_rsp = 0; | ||
| 3915 | |||
| 3916 | /* Unpin physical memory we referred to in vmcs02 */ | ||
| 3917 | if (vmx->nested.apic_access_page) { | ||
| 3918 | kvm_release_page_dirty(vmx->nested.apic_access_page); | ||
| 3919 | vmx->nested.apic_access_page = NULL; | ||
| 3920 | } | ||
| 3921 | if (vmx->nested.virtual_apic_page) { | ||
| 3922 | kvm_release_page_dirty(vmx->nested.virtual_apic_page); | ||
| 3923 | vmx->nested.virtual_apic_page = NULL; | ||
| 3924 | } | ||
| 3925 | if (vmx->nested.pi_desc_page) { | ||
| 3926 | kunmap(vmx->nested.pi_desc_page); | ||
| 3927 | kvm_release_page_dirty(vmx->nested.pi_desc_page); | ||
| 3928 | vmx->nested.pi_desc_page = NULL; | ||
| 3929 | vmx->nested.pi_desc = NULL; | ||
| 3930 | } | ||
| 3931 | |||
| 3932 | /* | ||
| 3933 | * We are now running in L2; the mmu_notifier will force a reload of the | ||
| 3934 | * page's hpa for the L2 vmcs. It needs to be reloaded for L1 before entering L1. | ||
| 3935 | */ | ||
| 3936 | kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); | ||
| 3937 | |||
| 3938 | if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs)) | ||
| 3939 | vmx->nested.need_vmcs12_sync = true; | ||
| 3940 | |||
| 3941 | /* in case we halted in L2 */ | ||
| 3942 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | ||
| 3943 | |||
| 3944 | if (likely(!vmx->fail)) { | ||
| 3945 | /* | ||
| 3946 | * TODO: SDM says that with acknowledge interrupt on | ||
| 3947 | * exit, bit 31 of the VM-exit interrupt information | ||
| 3948 | * (valid interrupt) is always set to 1 on | ||
| 3949 | * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't | ||
| 3950 | * need kvm_cpu_has_interrupt(). See the commit | ||
| 3951 | * message for details. | ||
| 3952 | */ | ||
| 3953 | if (nested_exit_intr_ack_set(vcpu) && | ||
| 3954 | exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && | ||
| 3955 | kvm_cpu_has_interrupt(vcpu)) { | ||
| 3956 | int irq = kvm_cpu_get_interrupt(vcpu); | ||
| 3957 | WARN_ON(irq < 0); | ||
| 3958 | vmcs12->vm_exit_intr_info = irq | | ||
| 3959 | INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; | ||
| 3960 | } | ||
| 3961 | |||
| 3962 | if (exit_reason != -1) | ||
| 3963 | trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, | ||
| 3964 | vmcs12->exit_qualification, | ||
| 3965 | vmcs12->idt_vectoring_info_field, | ||
| 3966 | vmcs12->vm_exit_intr_info, | ||
| 3967 | vmcs12->vm_exit_intr_error_code, | ||
| 3968 | KVM_ISA_VMX); | ||
| 3969 | |||
| 3970 | load_vmcs12_host_state(vcpu, vmcs12); | ||
| 3971 | |||
| 3972 | return; | ||
| 3973 | } | ||
| 3974 | |||
| 3975 | /* | ||
| 3976 | * After an early L2 VM-entry failure, we're now back | ||
| 3977 | * in L1 which thinks it just finished a VMLAUNCH or | ||
| 3978 | * VMRESUME instruction, so we need to set the failure | ||
| 3979 | * flag and the VM-instruction error field of the VMCS | ||
| 3980 | * accordingly, and skip the emulated instruction. | ||
| 3981 | */ | ||
| 3982 | (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); | ||
| 3983 | |||
| 3984 | /* | ||
| 3985 | * Restore L1's host state to KVM's software model. We're here | ||
| 3986 | * because a consistency check was caught by hardware, which | ||
| 3987 | * means some amount of guest state has been propagated to KVM's | ||
| 3988 | * model and needs to be unwound to the host's state. | ||
| 3989 | */ | ||
| 3990 | nested_vmx_restore_host_state(vcpu); | ||
| 3991 | |||
| 3992 | vmx->fail = 0; | ||
| 3993 | } | ||
| 3994 | |||
| 3995 | /* | ||
| 3996 | * Decode the memory-address operand of a vmx instruction, as recorded on an | ||
| 3997 | * exit caused by such an instruction (run by a guest hypervisor). | ||
| 3998 | * On success, returns 0. When the operand is invalid, returns 1 and throws | ||
| 3999 | * #UD or #GP. | ||
| 4000 | */ | ||
| 4001 | int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, | ||
| 4002 | u32 vmx_instruction_info, bool wr, gva_t *ret) | ||
| 4003 | { | ||
| 4004 | gva_t off; | ||
| 4005 | bool exn; | ||
| 4006 | struct kvm_segment s; | ||
| 4007 | |||
| 4008 | /* | ||
| 4009 | * According to Vol. 3B, "Information for VM Exits Due to Instruction | ||
| 4010 | * Execution", on an exit, vmx_instruction_info holds most of the | ||
| 4011 | * addressing components of the operand. Only the displacement part | ||
| 4012 | * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). | ||
| 4013 | * For how an actual address is calculated from all these components, | ||
| 4014 | * refer to Vol. 1, "Operand Addressing". | ||
| 4015 | */ | ||
| 4016 | int scaling = vmx_instruction_info & 3; | ||
| 4017 | int addr_size = (vmx_instruction_info >> 7) & 7; | ||
| 4018 | bool is_reg = vmx_instruction_info & (1u << 10); | ||
| 4019 | int seg_reg = (vmx_instruction_info >> 15) & 7; | ||
| 4020 | int index_reg = (vmx_instruction_info >> 18) & 0xf; | ||
| 4021 | bool index_is_valid = !(vmx_instruction_info & (1u << 22)); | ||
| 4022 | int base_reg = (vmx_instruction_info >> 23) & 0xf; | ||
| 4023 | bool base_is_valid = !(vmx_instruction_info & (1u << 27)); | ||
| 4024 | |||
| 4025 | if (is_reg) { | ||
| 4026 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
| 4027 | return 1; | ||
| 4028 | } | ||
| 4029 | |||
| 4030 | /* Addr = segment_base + offset */ | ||
| 4031 | /* offset = base + [index * scale] + displacement */ | ||
| 4032 | off = exit_qualification; /* holds the displacement */ | ||
| 4033 | if (base_is_valid) | ||
| 4034 | off += kvm_register_read(vcpu, base_reg); | ||
| 4035 | if (index_is_valid) | ||
| 4036 | off += kvm_register_read(vcpu, index_reg) << scaling; | ||
| 4037 | vmx_get_segment(vcpu, &s, seg_reg); | ||
| 4038 | *ret = s.base + off; | ||
| 4039 | |||
| 4040 | if (addr_size == 1) /* 32 bit */ | ||
| 4041 | *ret &= 0xffffffff; | ||
| 4042 | |||
| 4043 | /* Checks for #GP/#SS exceptions. */ | ||
| 4044 | exn = false; | ||
| 4045 | if (is_long_mode(vcpu)) { | ||
| 4046 | /* Long mode: #GP(0)/#SS(0) if the memory address is in a | ||
| 4047 | * non-canonical form. This is the only check on the memory | ||
| 4048 | * destination for long mode! | ||
| 4049 | */ | ||
| 4050 | exn = is_noncanonical_address(*ret, vcpu); | ||
| 4051 | } else if (is_protmode(vcpu)) { | ||
| 4052 | /* Protected mode: apply checks for segment validity in the | ||
| 4053 | * following order: | ||
| 4054 | * - segment type check (#GP(0) may be thrown) | ||
| 4055 | * - usability check (#GP(0)/#SS(0)) | ||
| 4056 | * - limit check (#GP(0)/#SS(0)) | ||
| 4057 | */ | ||
| 4058 | if (wr) | ||
| 4059 | /* #GP(0) if the destination operand is located in a | ||
| 4060 | * read-only data segment or any code segment. | ||
| 4061 | */ | ||
| 4062 | exn = ((s.type & 0xa) == 0 || (s.type & 8)); | ||
| 4063 | else | ||
| 4064 | /* #GP(0) if the source operand is located in an | ||
| 4065 | * execute-only code segment | ||
| 4066 | */ | ||
| 4067 | exn = ((s.type & 0xa) == 8); | ||
| 4068 | if (exn) { | ||
| 4069 | kvm_queue_exception_e(vcpu, GP_VECTOR, 0); | ||
| 4070 | return 1; | ||
| 4071 | } | ||
| 4072 | /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. | ||
| 4073 | */ | ||
| 4074 | exn = (s.unusable != 0); | ||
| 4075 | /* Protected mode: #GP(0)/#SS(0) if the memory | ||
| 4076 | * operand is outside the segment limit. | ||
| 4077 | */ | ||
| 4078 | exn = exn || (off + sizeof(u64) > s.limit); | ||
| 4079 | } | ||
| 4080 | if (exn) { | ||
| 4081 | kvm_queue_exception_e(vcpu, | ||
| 4082 | seg_reg == VCPU_SREG_SS ? | ||
| 4083 | SS_VECTOR : GP_VECTOR, | ||
| 4084 | 0); | ||
| 4085 | return 1; | ||
| 4086 | } | ||
| 4087 | |||
| 4088 | return 0; | ||
| 4089 | } | ||
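The decoding above can be exercised outside of KVM. The following standalone sketch mirrors the bit positions used by get_vmx_mem_address() (scaling in bits 1:0, address size in bits 9:7, register-operand flag in bit 10, segment register in bits 17:15, index register in bits 21:18 with its invalid flag in bit 22, base register in bits 26:23 with its invalid flag in bit 27); the struct and function names are illustrative, not part of KVM.

#include <stdint.h>
#include <stdio.h>

/* Illustrative decode of a VMX instruction-information field. */
struct vmx_mem_operand {
	int scaling;      /* bits 1:0, shift applied to the index register */
	int addr_size;    /* bits 9:7, 0 = 16-bit, 1 = 32-bit, 2 = 64-bit */
	int is_reg;       /* bit 10, operand is a register, not memory */
	int seg_reg;      /* bits 17:15 */
	int index_reg;    /* bits 21:18 */
	int index_valid;  /* bit 22 clear */
	int base_reg;     /* bits 26:23 */
	int base_valid;   /* bit 27 clear */
};

static struct vmx_mem_operand decode_vmx_instr_info(uint32_t info)
{
	struct vmx_mem_operand op = {
		.scaling     = info & 3,
		.addr_size   = (info >> 7) & 7,
		.is_reg      = !!(info & (1u << 10)),
		.seg_reg     = (info >> 15) & 7,
		.index_reg   = (info >> 18) & 0xf,
		.index_valid = !(info & (1u << 22)),
		.base_reg    = (info >> 23) & 0xf,
		.base_valid  = !(info & (1u << 27)),
	};

	return op;
}

int main(void)
{
	/* Hypothetical encoding: base RAX, index RCX scaled by 4, DS, 64-bit. */
	struct vmx_mem_operand op = decode_vmx_instr_info(0x58102);

	printf("scaling=%d seg=%d base=%d index=%d\n",
	       op.scaling, op.seg_reg, op.base_reg, op.index_reg);
	return 0;
}

With that value the operand is a memory operand addressed as DS:[RAX + RCX*4] under a 64-bit address size; the displacement still comes from the exit qualification, exactly as in the function above.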
| 4090 | |||
| 4091 | static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) | ||
| 4092 | { | ||
| 4093 | gva_t gva; | ||
| 4094 | struct x86_exception e; | ||
| 4095 | |||
| 4096 | if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), | ||
| 4097 | vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva)) | ||
| 4098 | return 1; | ||
| 4099 | |||
| 4100 | if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) { | ||
| 4101 | kvm_inject_page_fault(vcpu, &e); | ||
| 4102 | return 1; | ||
| 4103 | } | ||
| 4104 | |||
| 4105 | return 0; | ||
| 4106 | } | ||
| 4107 | |||
| 4108 | /* | ||
| 4109 | * Allocate a shadow VMCS and associate it with the currently loaded | ||
| 4110 | * VMCS, unless such a shadow VMCS already exists. The newly allocated | ||
| 4111 | * VMCS is also VMCLEARed, so that it is ready for use. | ||
| 4112 | */ | ||
| 4113 | static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) | ||
| 4114 | { | ||
| 4115 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 4116 | struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; | ||
| 4117 | |||
| 4118 | /* | ||
| 4119 | * We should allocate a shadow vmcs for vmcs01 only when L1 | ||
| 4120 | * executes VMXON and free it when L1 executes VMXOFF. | ||
| 4121 | * As it is invalid to execute VMXON twice, we shouldn't reach | ||
| 4122 | * here when vmcs01 already has an allocated shadow vmcs. | ||
| 4123 | */ | ||
| 4124 | WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); | ||
| 4125 | |||
| 4126 | if (!loaded_vmcs->shadow_vmcs) { | ||
| 4127 | loaded_vmcs->shadow_vmcs = alloc_vmcs(true); | ||
| 4128 | if (loaded_vmcs->shadow_vmcs) | ||
| 4129 | vmcs_clear(loaded_vmcs->shadow_vmcs); | ||
| 4130 | } | ||
| 4131 | return loaded_vmcs->shadow_vmcs; | ||
| 4132 | } | ||
| 4133 | |||
| 4134 | static int enter_vmx_operation(struct kvm_vcpu *vcpu) | ||
| 4135 | { | ||
| 4136 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 4137 | int r; | ||
| 4138 | |||
| 4139 | r = alloc_loaded_vmcs(&vmx->nested.vmcs02); | ||
| 4140 | if (r < 0) | ||
| 4141 | goto out_vmcs02; | ||
| 4142 | |||
| 4143 | vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); | ||
| 4144 | if (!vmx->nested.cached_vmcs12) | ||
| 4145 | goto out_cached_vmcs12; | ||
| 4146 | |||
| 4147 | vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); | ||
| 4148 | if (!vmx->nested.cached_shadow_vmcs12) | ||
| 4149 | goto out_cached_shadow_vmcs12; | ||
| 4150 | |||
| 4151 | if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) | ||
| 4152 | goto out_shadow_vmcs; | ||
| 4153 | |||
| 4154 | hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, | ||
| 4155 | HRTIMER_MODE_REL_PINNED); | ||
| 4156 | vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; | ||
| 4157 | |||
| 4158 | vmx->nested.vpid02 = allocate_vpid(); | ||
| 4159 | |||
| 4160 | vmx->nested.vmcs02_initialized = false; | ||
| 4161 | vmx->nested.vmxon = true; | ||
| 4162 | |||
| 4163 | if (pt_mode == PT_MODE_HOST_GUEST) { | ||
| 4164 | vmx->pt_desc.guest.ctl = 0; | ||
| 4165 | pt_update_intercept_for_msr(vmx); | ||
| 4166 | } | ||
| 4167 | |||
| 4168 | return 0; | ||
| 4169 | |||
| 4170 | out_shadow_vmcs: | ||
| 4171 | kfree(vmx->nested.cached_shadow_vmcs12); | ||
| 4172 | |||
| 4173 | out_cached_shadow_vmcs12: | ||
| 4174 | kfree(vmx->nested.cached_vmcs12); | ||
| 4175 | |||
| 4176 | out_cached_vmcs12: | ||
| 4177 | free_loaded_vmcs(&vmx->nested.vmcs02); | ||
| 4178 | |||
| 4179 | out_vmcs02: | ||
| 4180 | return -ENOMEM; | ||
| 4181 | } | ||
| 4182 | |||
| 4183 | /* | ||
| 4184 | * Emulate the VMXON instruction. | ||
| 4185 | * Currently, we just remember that VMX is active, and do not save or even | ||
| 4186 | * inspect the argument to VMXON (the so-called "VMXON pointer") because we | ||
| 4187 | * do not currently need to store anything in that guest-allocated memory | ||
| 4188 | * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their | ||
| 4189 | * argument is different from the VMXON pointer (which the spec says they do). | ||
| 4190 | */ | ||
| 4191 | static int handle_vmon(struct kvm_vcpu *vcpu) | ||
| 4192 | { | ||
| 4193 | int ret; | ||
| 4194 | gpa_t vmptr; | ||
| 4195 | struct page *page; | ||
| 4196 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 4197 | const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED | ||
| 4198 | | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; | ||
| 4199 | |||
| 4200 | /* | ||
| 4201 | * The Intel VMX Instruction Reference lists a bunch of bits that are | ||
| 4202 | * prerequisite to running VMXON, most notably cr4.VMXE must be set to | ||
| 4203 | * 1 (see vmx_set_cr4() for when we allow the guest to set this). | ||
| 4204 | * Otherwise, we should fail with #UD. But most faulting conditions | ||
| 4205 | * have already been checked by hardware, prior to the VM-exit for | ||
| 4206 | * VMXON. We do test guest cr4.VMXE because processor CR4 always has | ||
| 4207 | * that bit set to 1 in non-root mode. | ||
| 4208 | */ | ||
| 4209 | if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { | ||
| 4210 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
| 4211 | return 1; | ||
| 4212 | } | ||
| 4213 | |||
| 4214 | /* CPL=0 must be checked manually. */ | ||
| 4215 | if (vmx_get_cpl(vcpu)) { | ||
| 4216 | kvm_inject_gp(vcpu, 0); | ||
| 4217 | return 1; | ||
| 4218 | } | ||
| 4219 | |||
| 4220 | if (vmx->nested.vmxon) | ||
| 4221 | return nested_vmx_failValid(vcpu, | ||
| 4222 | VMXERR_VMXON_IN_VMX_ROOT_OPERATION); | ||
| 4223 | |||
| 4224 | if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) | ||
| 4225 | != VMXON_NEEDED_FEATURES) { | ||
| 4226 | kvm_inject_gp(vcpu, 0); | ||
| 4227 | return 1; | ||
| 4228 | } | ||
| 4229 | |||
| 4230 | if (nested_vmx_get_vmptr(vcpu, &vmptr)) | ||
| 4231 | return 1; | ||
| 4232 | |||
| 4233 | /* | ||
| 4234 | * SDM 3: 24.11.5 | ||
| 4235 | * The first 4 bytes of the VMXON region contain the supported | ||
| 4236 | * VMCS revision identifier | ||
| 4237 | * | ||
| 4238 | * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case, | ||
| 4239 | * which would replace the physical address width with 32. | ||
| 4240 | */ | ||
| 4241 | if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) | ||
| 4242 | return nested_vmx_failInvalid(vcpu); | ||
| 4243 | |||
| 4244 | page = kvm_vcpu_gpa_to_page(vcpu, vmptr); | ||
| 4245 | if (is_error_page(page)) | ||
| 4246 | return nested_vmx_failInvalid(vcpu); | ||
| 4247 | |||
| 4248 | if (*(u32 *)kmap(page) != VMCS12_REVISION) { | ||
| 4249 | kunmap(page); | ||
| 4250 | kvm_release_page_clean(page); | ||
| 4251 | return nested_vmx_failInvalid(vcpu); | ||
| 4252 | } | ||
| 4253 | kunmap(page); | ||
| 4254 | kvm_release_page_clean(page); | ||
| 4255 | |||
| 4256 | vmx->nested.vmxon_ptr = vmptr; | ||
| 4257 | ret = enter_vmx_operation(vcpu); | ||
| 4258 | if (ret) | ||
| 4259 | return ret; | ||
| 4260 | |||
| 4261 | return nested_vmx_succeed(vcpu); | ||
| 4262 | } | ||
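The same page-alignment and physical-address-width test on vmptr recurs in the VMCLEAR and VMPTRLD handlers below. Reduced to a standalone predicate (with maxphyaddr passed in rather than taken from guest CPUID, and with an illustrative function name) it amounts to:

#include <stdbool.h>
#include <stdint.h>

/*
 * A VMXON/VMCLEAR/VMPTRLD pointer is rejected unless it is 4 KiB aligned
 * and sets no bits at or above the guest's physical-address width.
 */
static bool nested_vmptr_is_valid(uint64_t vmptr, unsigned int maxphyaddr)
{
	return (vmptr & 0xfff) == 0 && (vmptr >> maxphyaddr) == 0;
}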
| 4263 | |||
| 4264 | static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) | ||
| 4265 | { | ||
| 4266 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 4267 | |||
| 4268 | if (vmx->nested.current_vmptr == -1ull) | ||
| 4269 | return; | ||
| 4270 | |||
| 4271 | if (enable_shadow_vmcs) { | ||
| 4272 | /* copy to memory all shadowed fields in case | ||
| 4273 | they were modified */ | ||
| 4274 | copy_shadow_to_vmcs12(vmx); | ||
| 4275 | vmx->nested.need_vmcs12_sync = false; | ||
| 4276 | vmx_disable_shadow_vmcs(vmx); | ||
| 4277 | } | ||
| 4278 | vmx->nested.posted_intr_nv = -1; | ||
| 4279 | |||
| 4280 | /* Flush VMCS12 to guest memory */ | ||
| 4281 | kvm_vcpu_write_guest_page(vcpu, | ||
| 4282 | vmx->nested.current_vmptr >> PAGE_SHIFT, | ||
| 4283 | vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); | ||
| 4284 | |||
| 4285 | kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); | ||
| 4286 | |||
| 4287 | vmx->nested.current_vmptr = -1ull; | ||
| 4288 | } | ||
| 4289 | |||
| 4290 | /* Emulate the VMXOFF instruction */ | ||
| 4291 | static int handle_vmoff(struct kvm_vcpu *vcpu) | ||
| 4292 | { | ||
| 4293 | if (!nested_vmx_check_permission(vcpu)) | ||
| 4294 | return 1; | ||
| 4295 | free_nested(vcpu); | ||
| 4296 | return nested_vmx_succeed(vcpu); | ||
| 4297 | } | ||
| 4298 | |||
| 4299 | /* Emulate the VMCLEAR instruction */ | ||
| 4300 | static int handle_vmclear(struct kvm_vcpu *vcpu) | ||
| 4301 | { | ||
| 4302 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 4303 | u32 zero = 0; | ||
| 4304 | gpa_t vmptr; | ||
| 4305 | |||
| 4306 | if (!nested_vmx_check_permission(vcpu)) | ||
| 4307 | return 1; | ||
| 4308 | |||
| 4309 | if (nested_vmx_get_vmptr(vcpu, &vmptr)) | ||
| 4310 | return 1; | ||
| 4311 | |||
| 4312 | if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) | ||
| 4313 | return nested_vmx_failValid(vcpu, | ||
| 4314 | VMXERR_VMCLEAR_INVALID_ADDRESS); | ||
| 4315 | |||
| 4316 | if (vmptr == vmx->nested.vmxon_ptr) | ||
| 4317 | return nested_vmx_failValid(vcpu, | ||
| 4318 | VMXERR_VMCLEAR_VMXON_POINTER); | ||
| 4319 | |||
| 4320 | if (vmx->nested.hv_evmcs_page) { | ||
| 4321 | if (vmptr == vmx->nested.hv_evmcs_vmptr) | ||
| 4322 | nested_release_evmcs(vcpu); | ||
| 4323 | } else { | ||
| 4324 | if (vmptr == vmx->nested.current_vmptr) | ||
| 4325 | nested_release_vmcs12(vcpu); | ||
| 4326 | |||
| 4327 | kvm_vcpu_write_guest(vcpu, | ||
| 4328 | vmptr + offsetof(struct vmcs12, | ||
| 4329 | launch_state), | ||
| 4330 | &zero, sizeof(zero)); | ||
| 4331 | } | ||
| 4332 | |||
| 4333 | return nested_vmx_succeed(vcpu); | ||
| 4334 | } | ||
| 4335 | |||
| 4336 | static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); | ||
| 4337 | |||
| 4338 | /* Emulate the VMLAUNCH instruction */ | ||
| 4339 | static int handle_vmlaunch(struct kvm_vcpu *vcpu) | ||
| 4340 | { | ||
| 4341 | return nested_vmx_run(vcpu, true); | ||
| 4342 | } | ||
| 4343 | |||
| 4344 | /* Emulate the VMRESUME instruction */ | ||
| 4345 | static int handle_vmresume(struct kvm_vcpu *vcpu) | ||
| 4346 | { | ||
| 4347 | |||
| 4348 | return nested_vmx_run(vcpu, false); | ||
| 4349 | } | ||
| 4350 | |||
| 4351 | static int handle_vmread(struct kvm_vcpu *vcpu) | ||
| 4352 | { | ||
| 4353 | unsigned long field; | ||
| 4354 | u64 field_value; | ||
| 4355 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 4356 | u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
| 4357 | gva_t gva = 0; | ||
| 4358 | struct vmcs12 *vmcs12; | ||
| 4359 | |||
| 4360 | if (!nested_vmx_check_permission(vcpu)) | ||
| 4361 | return 1; | ||
| 4362 | |||
| 4363 | if (to_vmx(vcpu)->nested.current_vmptr == -1ull) | ||
| 4364 | return nested_vmx_failInvalid(vcpu); | ||
| 4365 | |||
| 4366 | if (!is_guest_mode(vcpu)) | ||
| 4367 | vmcs12 = get_vmcs12(vcpu); | ||
| 4368 | else { | ||
| 4369 | /* | ||
| 4370 | * When vmcs->vmcs_link_pointer is -1ull, any VMREAD | ||
| 4371 | * to a shadowed field sets the ALU flags for VMfailInvalid. | ||
| 4372 | */ | ||
| 4373 | if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) | ||
| 4374 | return nested_vmx_failInvalid(vcpu); | ||
| 4375 | vmcs12 = get_shadow_vmcs12(vcpu); | ||
| 4376 | } | ||
| 4377 | |||
| 4378 | /* Decode instruction info and find the field to read */ | ||
| 4379 | field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); | ||
| 4380 | /* Read the field, zero-extended to a u64 field_value */ | ||
| 4381 | if (vmcs12_read_any(vmcs12, field, &field_value) < 0) | ||
| 4382 | return nested_vmx_failValid(vcpu, | ||
| 4383 | VMXERR_UNSUPPORTED_VMCS_COMPONENT); | ||
| 4384 | |||
| 4385 | /* | ||
| 4386 | * Now copy part of this value to register or memory, as requested. | ||
| 4387 | * Note that the number of bits actually copied is 32 or 64 depending | ||
| 4388 | * on the guest's mode (32 or 64 bit), not on the given field's length. | ||
| 4389 | */ | ||
| 4390 | if (vmx_instruction_info & (1u << 10)) { | ||
| 4391 | kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), | ||
| 4392 | field_value); | ||
| 4393 | } else { | ||
| 4394 | if (get_vmx_mem_address(vcpu, exit_qualification, | ||
| 4395 | vmx_instruction_info, true, &gva)) | ||
| 4396 | return 1; | ||
| 4397 | /* _system ok, nested_vmx_check_permission has verified cpl=0 */ | ||
| 4398 | kvm_write_guest_virt_system(vcpu, gva, &field_value, | ||
| 4399 | (is_long_mode(vcpu) ? 8 : 4), NULL); | ||
| 4400 | } | ||
| 4401 | |||
| 4402 | return nested_vmx_succeed(vcpu); | ||
| 4403 | } | ||
| 4404 | |||
| 4405 | |||
| 4406 | static int handle_vmwrite(struct kvm_vcpu *vcpu) | ||
| 4407 | { | ||
| 4408 | unsigned long field; | ||
| 4409 | gva_t gva; | ||
| 4410 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 4411 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 4412 | u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
| 4413 | |||
| 4414 | /* The value to write might be 32 or 64 bits, depending on L1's long | ||
| 4415 | * mode, and eventually we need to write that into a field of several | ||
| 4416 | * possible lengths. The code below first zero-extends the value to 64 | ||
| 4417 | * bit (field_value), and then copies only the appropriate number of | ||
| 4418 | * bits into the vmcs12 field. | ||
| 4419 | */ | ||
| 4420 | u64 field_value = 0; | ||
| 4421 | struct x86_exception e; | ||
| 4422 | struct vmcs12 *vmcs12; | ||
| 4423 | |||
| 4424 | if (!nested_vmx_check_permission(vcpu)) | ||
| 4425 | return 1; | ||
| 4426 | |||
| 4427 | if (vmx->nested.current_vmptr == -1ull) | ||
| 4428 | return nested_vmx_failInvalid(vcpu); | ||
| 4429 | |||
| 4430 | if (vmx_instruction_info & (1u << 10)) | ||
| 4431 | field_value = kvm_register_readl(vcpu, | ||
| 4432 | (((vmx_instruction_info) >> 3) & 0xf)); | ||
| 4433 | else { | ||
| 4434 | if (get_vmx_mem_address(vcpu, exit_qualification, | ||
| 4435 | vmx_instruction_info, false, &gva)) | ||
| 4436 | return 1; | ||
| 4437 | if (kvm_read_guest_virt(vcpu, gva, &field_value, | ||
| 4438 | (is_64_bit_mode(vcpu) ? 8 : 4), &e)) { | ||
| 4439 | kvm_inject_page_fault(vcpu, &e); | ||
| 4440 | return 1; | ||
| 4441 | } | ||
| 4442 | } | ||
| 4443 | |||
| 4444 | |||
| 4445 | field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); | ||
| 4446 | /* | ||
| 4447 | * If the vCPU supports "VMWRITE to any supported field in the | ||
| 4448 | * VMCS," then the "read-only" fields are actually read/write. | ||
| 4449 | */ | ||
| 4450 | if (vmcs_field_readonly(field) && | ||
| 4451 | !nested_cpu_has_vmwrite_any_field(vcpu)) | ||
| 4452 | return nested_vmx_failValid(vcpu, | ||
| 4453 | VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); | ||
| 4454 | |||
| 4455 | if (!is_guest_mode(vcpu)) | ||
| 4456 | vmcs12 = get_vmcs12(vcpu); | ||
| 4457 | else { | ||
| 4458 | /* | ||
| 4459 | * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE | ||
| 4460 | * to a shadowed field sets the ALU flags for VMfailInvalid. | ||
| 4461 | */ | ||
| 4462 | if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) | ||
| 4463 | return nested_vmx_failInvalid(vcpu); | ||
| 4464 | vmcs12 = get_shadow_vmcs12(vcpu); | ||
| 4465 | } | ||
| 4466 | |||
| 4467 | if (vmcs12_write_any(vmcs12, field, field_value) < 0) | ||
| 4468 | return nested_vmx_failValid(vcpu, | ||
| 4469 | VMXERR_UNSUPPORTED_VMCS_COMPONENT); | ||
| 4470 | |||
| 4471 | /* | ||
| 4472 | * Do not track vmcs12 dirty-state if in guest-mode | ||
| 4473 | * as we actually dirty shadow vmcs12 instead of vmcs12. | ||
| 4474 | */ | ||
| 4475 | if (!is_guest_mode(vcpu)) { | ||
| 4476 | switch (field) { | ||
| 4477 | #define SHADOW_FIELD_RW(x) case x: | ||
| 4478 | #include "vmcs_shadow_fields.h" | ||
| 4479 | /* | ||
| 4480 | * The fields that can be updated by L1 without a vmexit are | ||
| 4481 | * always updated in the vmcs02; the others go down the slow | ||
| 4482 | * path of prepare_vmcs02. | ||
| 4483 | */ | ||
| 4484 | break; | ||
| 4485 | default: | ||
| 4486 | vmx->nested.dirty_vmcs12 = true; | ||
| 4487 | break; | ||
| 4488 | } | ||
| 4489 | } | ||
| 4490 | |||
| 4491 | return nested_vmx_succeed(vcpu); | ||
| 4492 | } | ||
| 4493 | |||
| 4494 | static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) | ||
| 4495 | { | ||
| 4496 | vmx->nested.current_vmptr = vmptr; | ||
| 4497 | if (enable_shadow_vmcs) { | ||
| 4498 | vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, | ||
| 4499 | SECONDARY_EXEC_SHADOW_VMCS); | ||
| 4500 | vmcs_write64(VMCS_LINK_POINTER, | ||
| 4501 | __pa(vmx->vmcs01.shadow_vmcs)); | ||
| 4502 | vmx->nested.need_vmcs12_sync = true; | ||
| 4503 | } | ||
| 4504 | vmx->nested.dirty_vmcs12 = true; | ||
| 4505 | } | ||
| 4506 | |||
| 4507 | /* Emulate the VMPTRLD instruction */ | ||
| 4508 | static int handle_vmptrld(struct kvm_vcpu *vcpu) | ||
| 4509 | { | ||
| 4510 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 4511 | gpa_t vmptr; | ||
| 4512 | |||
| 4513 | if (!nested_vmx_check_permission(vcpu)) | ||
| 4514 | return 1; | ||
| 4515 | |||
| 4516 | if (nested_vmx_get_vmptr(vcpu, &vmptr)) | ||
| 4517 | return 1; | ||
| 4518 | |||
| 4519 | if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) | ||
| 4520 | return nested_vmx_failValid(vcpu, | ||
| 4521 | VMXERR_VMPTRLD_INVALID_ADDRESS); | ||
| 4522 | |||
| 4523 | if (vmptr == vmx->nested.vmxon_ptr) | ||
| 4524 | return nested_vmx_failValid(vcpu, | ||
| 4525 | VMXERR_VMPTRLD_VMXON_POINTER); | ||
| 4526 | |||
| 4527 | /* Forbid normal VMPTRLD if Enlightened version was used */ | ||
| 4528 | if (vmx->nested.hv_evmcs) | ||
| 4529 | return 1; | ||
| 4530 | |||
| 4531 | if (vmx->nested.current_vmptr != vmptr) { | ||
| 4532 | struct vmcs12 *new_vmcs12; | ||
| 4533 | struct page *page; | ||
| 4534 | |||
| 4535 | page = kvm_vcpu_gpa_to_page(vcpu, vmptr); | ||
| 4536 | if (is_error_page(page)) { | ||
| 4537 | /* | ||
| 4538 | * Reads from an unbacked page return all 1s, | ||
| 4539 | * which means that the 32 bits located at the | ||
| 4540 | * given physical address won't match the required | ||
| 4541 | * VMCS12_REVISION identifier. | ||
| 4542 | */ | ||
| 4543 | nested_vmx_failValid(vcpu, | ||
| 4544 | VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); | ||
| 4545 | return kvm_skip_emulated_instruction(vcpu); | ||
| 4546 | } | ||
| 4547 | new_vmcs12 = kmap(page); | ||
| 4548 | if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || | ||
| 4549 | (new_vmcs12->hdr.shadow_vmcs && | ||
| 4550 | !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { | ||
| 4551 | kunmap(page); | ||
| 4552 | kvm_release_page_clean(page); | ||
| 4553 | return nested_vmx_failValid(vcpu, | ||
| 4554 | VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); | ||
| 4555 | } | ||
| 4556 | |||
| 4557 | nested_release_vmcs12(vcpu); | ||
| 4558 | |||
| 4559 | /* | ||
| 4560 | * Load VMCS12 from guest memory since it is not already | ||
| 4561 | * cached. | ||
| 4562 | */ | ||
| 4563 | memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); | ||
| 4564 | kunmap(page); | ||
| 4565 | kvm_release_page_clean(page); | ||
| 4566 | |||
| 4567 | set_current_vmptr(vmx, vmptr); | ||
| 4568 | } | ||
| 4569 | |||
| 4570 | return nested_vmx_succeed(vcpu); | ||
| 4571 | } | ||
| 4572 | |||
| 4573 | /* Emulate the VMPTRST instruction */ | ||
| 4574 | static int handle_vmptrst(struct kvm_vcpu *vcpu) | ||
| 4575 | { | ||
| 4576 | unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION); | ||
| 4577 | u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
| 4578 | gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; | ||
| 4579 | struct x86_exception e; | ||
| 4580 | gva_t gva; | ||
| 4581 | |||
| 4582 | if (!nested_vmx_check_permission(vcpu)) | ||
| 4583 | return 1; | ||
| 4584 | |||
| 4585 | if (unlikely(to_vmx(vcpu)->nested.hv_evmcs)) | ||
| 4586 | return 1; | ||
| 4587 | |||
| 4588 | if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva)) | ||
| 4589 | return 1; | ||
| 4590 | /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ | ||
| 4591 | if (kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, | ||
| 4592 | sizeof(gpa_t), &e)) { | ||
| 4593 | kvm_inject_page_fault(vcpu, &e); | ||
| 4594 | return 1; | ||
| 4595 | } | ||
| 4596 | return nested_vmx_succeed(vcpu); | ||
| 4597 | } | ||
| 4598 | |||
| 4599 | /* Emulate the INVEPT instruction */ | ||
| 4600 | static int handle_invept(struct kvm_vcpu *vcpu) | ||
| 4601 | { | ||
| 4602 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 4603 | u32 vmx_instruction_info, types; | ||
| 4604 | unsigned long type; | ||
| 4605 | gva_t gva; | ||
| 4606 | struct x86_exception e; | ||
| 4607 | struct { | ||
| 4608 | u64 eptp, gpa; | ||
| 4609 | } operand; | ||
| 4610 | |||
| 4611 | if (!(vmx->nested.msrs.secondary_ctls_high & | ||
| 4612 | SECONDARY_EXEC_ENABLE_EPT) || | ||
| 4613 | !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { | ||
| 4614 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
| 4615 | return 1; | ||
| 4616 | } | ||
| 4617 | |||
| 4618 | if (!nested_vmx_check_permission(vcpu)) | ||
| 4619 | return 1; | ||
| 4620 | |||
| 4621 | vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
| 4622 | type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); | ||
| 4623 | |||
| 4624 | types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; | ||
| 4625 | |||
| 4626 | if (type >= 32 || !(types & (1 << type))) | ||
| 4627 | return nested_vmx_failValid(vcpu, | ||
| 4628 | VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); | ||
| 4629 | |||
| 4630 | /* According to the Intel VMX instruction reference, the memory | ||
| 4631 | * operand is read even if it isn't needed (e.g., for type==global) | ||
| 4632 | */ | ||
| 4633 | if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), | ||
| 4634 | vmx_instruction_info, false, &gva)) | ||
| 4635 | return 1; | ||
| 4636 | if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { | ||
| 4637 | kvm_inject_page_fault(vcpu, &e); | ||
| 4638 | return 1; | ||
| 4639 | } | ||
| 4640 | |||
| 4641 | switch (type) { | ||
| 4642 | case VMX_EPT_EXTENT_GLOBAL: | ||
| 4643 | /* | ||
| 4644 | * TODO: track mappings and invalidate | ||
| 4645 | * single context requests appropriately | ||
| 4646 | */ | ||
| 4647 | case VMX_EPT_EXTENT_CONTEXT: | ||
| 4648 | kvm_mmu_sync_roots(vcpu); | ||
| 4649 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); | ||
| 4650 | break; | ||
| 4651 | default: | ||
| 4652 | BUG_ON(1); | ||
| 4653 | break; | ||
| 4654 | } | ||
| 4655 | |||
| 4656 | return nested_vmx_succeed(vcpu); | ||
| 4657 | } | ||
| 4658 | |||
| 4659 | static int handle_invvpid(struct kvm_vcpu *vcpu) | ||
| 4660 | { | ||
| 4661 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 4662 | u32 vmx_instruction_info; | ||
| 4663 | unsigned long type, types; | ||
| 4664 | gva_t gva; | ||
| 4665 | struct x86_exception e; | ||
| 4666 | struct { | ||
| 4667 | u64 vpid; | ||
| 4668 | u64 gla; | ||
| 4669 | } operand; | ||
| 4670 | u16 vpid02; | ||
| 4671 | |||
| 4672 | if (!(vmx->nested.msrs.secondary_ctls_high & | ||
| 4673 | SECONDARY_EXEC_ENABLE_VPID) || | ||
| 4674 | !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { | ||
| 4675 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
| 4676 | return 1; | ||
| 4677 | } | ||
| 4678 | |||
| 4679 | if (!nested_vmx_check_permission(vcpu)) | ||
| 4680 | return 1; | ||
| 4681 | |||
| 4682 | vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
| 4683 | type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); | ||
| 4684 | |||
| 4685 | types = (vmx->nested.msrs.vpid_caps & | ||
| 4686 | VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; | ||
| 4687 | |||
| 4688 | if (type >= 32 || !(types & (1 << type))) | ||
| 4689 | return nested_vmx_failValid(vcpu, | ||
| 4690 | VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); | ||
| 4691 | |||
| 4692 | /* According to the Intel VMX instruction reference, the memory | ||
| 4693 | * operand is read even if it isn't needed (e.g., for type==global) | ||
| 4694 | */ | ||
| 4695 | if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), | ||
| 4696 | vmx_instruction_info, false, &gva)) | ||
| 4697 | return 1; | ||
| 4698 | if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { | ||
| 4699 | kvm_inject_page_fault(vcpu, &e); | ||
| 4700 | return 1; | ||
| 4701 | } | ||
| 4702 | if (operand.vpid >> 16) | ||
| 4703 | return nested_vmx_failValid(vcpu, | ||
| 4704 | VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); | ||
| 4705 | |||
| 4706 | vpid02 = nested_get_vpid02(vcpu); | ||
| 4707 | switch (type) { | ||
| 4708 | case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: | ||
| 4709 | if (!operand.vpid || | ||
| 4710 | is_noncanonical_address(operand.gla, vcpu)) | ||
| 4711 | return nested_vmx_failValid(vcpu, | ||
| 4712 | VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); | ||
| 4713 | if (cpu_has_vmx_invvpid_individual_addr()) { | ||
| 4714 | __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, | ||
| 4715 | vpid02, operand.gla); | ||
| 4716 | } else | ||
| 4717 | __vmx_flush_tlb(vcpu, vpid02, false); | ||
| 4718 | break; | ||
| 4719 | case VMX_VPID_EXTENT_SINGLE_CONTEXT: | ||
| 4720 | case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: | ||
| 4721 | if (!operand.vpid) | ||
| 4722 | return nested_vmx_failValid(vcpu, | ||
| 4723 | VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); | ||
| 4724 | __vmx_flush_tlb(vcpu, vpid02, false); | ||
| 4725 | break; | ||
| 4726 | case VMX_VPID_EXTENT_ALL_CONTEXT: | ||
| 4727 | __vmx_flush_tlb(vcpu, vpid02, false); | ||
| 4728 | break; | ||
| 4729 | default: | ||
| 4730 | WARN_ON_ONCE(1); | ||
| 4731 | return kvm_skip_emulated_instruction(vcpu); | ||
| 4732 | } | ||
| 4733 | |||
| 4734 | return nested_vmx_succeed(vcpu); | ||
| 4735 | } | ||
| 4736 | |||
| 4737 | static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, | ||
| 4738 | struct vmcs12 *vmcs12) | ||
| 4739 | { | ||
| 4740 | u32 index = vcpu->arch.regs[VCPU_REGS_RCX]; | ||
| 4741 | u64 address; | ||
| 4742 | bool accessed_dirty; | ||
| 4743 | struct kvm_mmu *mmu = vcpu->arch.walk_mmu; | ||
| 4744 | |||
| 4745 | if (!nested_cpu_has_eptp_switching(vmcs12) || | ||
| 4746 | !nested_cpu_has_ept(vmcs12)) | ||
| 4747 | return 1; | ||
| 4748 | |||
| 4749 | if (index >= VMFUNC_EPTP_ENTRIES) | ||
| 4750 | return 1; | ||
| 4751 | |||
| 4752 | |||
| 4753 | if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, | ||
| 4754 | &address, index * 8, 8)) | ||
| 4755 | return 1; | ||
| 4756 | |||
| 4757 | accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT); | ||
| 4758 | |||
| 4759 | /* | ||
| 4760 | * If the (L2) guest does a vmfunc to the currently | ||
| 4761 | * active ept pointer, we don't have to do anything else | ||
| 4762 | */ | ||
| 4763 | if (vmcs12->ept_pointer != address) { | ||
| 4764 | if (!valid_ept_address(vcpu, address)) | ||
| 4765 | return 1; | ||
| 4766 | |||
| 4767 | kvm_mmu_unload(vcpu); | ||
| 4768 | mmu->ept_ad = accessed_dirty; | ||
| 4769 | mmu->mmu_role.base.ad_disabled = !accessed_dirty; | ||
| 4770 | vmcs12->ept_pointer = address; | ||
| 4771 | /* | ||
| 4772 | * TODO: Check what's the correct approach in case | ||
| 4773 | * mmu reload fails. Currently, we just let the next | ||
| 4774 | * reload potentially fail | ||
| 4775 | */ | ||
| 4776 | kvm_mmu_reload(vcpu); | ||
| 4777 | } | ||
| 4778 | |||
| 4779 | return 0; | ||
| 4780 | } | ||
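For reference, each 8-byte entry in the EPTP list page is a full EPT pointer; a standalone decode of its fields, assuming the SDM layout (memory type in bits 2:0, page-walk length minus one in bits 5:3, accessed/dirty enable in bit 6, PML4 table address in bits 51:12) and using illustrative names, could look like:

#include <stdbool.h>
#include <stdint.h>

/* Illustrative EPTP decode, assuming the SDM bit layout. */
struct eptp_fields {
	unsigned int memtype;       /* bits 2:0, 0 = UC, 6 = WB */
	unsigned int walk_levels;   /* bits 5:3 hold (levels - 1) */
	bool ad_enabled;            /* bit 6 */
	uint64_t pml4_addr;         /* bits 51:12 */
};

static struct eptp_fields decode_eptp(uint64_t eptp)
{
	struct eptp_fields f = {
		.memtype     = eptp & 0x7,
		.walk_levels = ((eptp >> 3) & 0x7) + 1,
		.ad_enabled  = eptp & (1ULL << 6),
		.pml4_addr   = eptp & 0x000ffffffffff000ULL,
	};

	return f;
}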
| 4781 | |||
| 4782 | static int handle_vmfunc(struct kvm_vcpu *vcpu) | ||
| 4783 | { | ||
| 4784 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 4785 | struct vmcs12 *vmcs12; | ||
| 4786 | u32 function = vcpu->arch.regs[VCPU_REGS_RAX]; | ||
| 4787 | |||
| 4788 | /* | ||
| 4789 | * VMFUNC is only supported for nested guests, but we always enable the | ||
| 4790 | * secondary control for simplicity; for non-nested mode, fake that we | ||
| 4791 | * didn't enable it by injecting #UD. | ||
| 4792 | */ | ||
| 4793 | if (!is_guest_mode(vcpu)) { | ||
| 4794 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
| 4795 | return 1; | ||
| 4796 | } | ||
| 4797 | |||
| 4798 | vmcs12 = get_vmcs12(vcpu); | ||
| 4799 | if ((vmcs12->vm_function_control & (1 << function)) == 0) | ||
| 4800 | goto fail; | ||
| 4801 | |||
| 4802 | switch (function) { | ||
| 4803 | case 0: | ||
| 4804 | if (nested_vmx_eptp_switching(vcpu, vmcs12)) | ||
| 4805 | goto fail; | ||
| 4806 | break; | ||
| 4807 | default: | ||
| 4808 | goto fail; | ||
| 4809 | } | ||
| 4810 | return kvm_skip_emulated_instruction(vcpu); | ||
| 4811 | |||
| 4812 | fail: | ||
| 4813 | nested_vmx_vmexit(vcpu, vmx->exit_reason, | ||
| 4814 | vmcs_read32(VM_EXIT_INTR_INFO), | ||
| 4815 | vmcs_readl(EXIT_QUALIFICATION)); | ||
| 4816 | return 1; | ||
| 4817 | } | ||
| 4818 | |||
| 4819 | |||
| 4820 | static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, | ||
| 4821 | struct vmcs12 *vmcs12) | ||
| 4822 | { | ||
| 4823 | unsigned long exit_qualification; | ||
| 4824 | gpa_t bitmap, last_bitmap; | ||
| 4825 | unsigned int port; | ||
| 4826 | int size; | ||
| 4827 | u8 b; | ||
| 4828 | |||
| 4829 | if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) | ||
| 4830 | return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); | ||
| 4831 | |||
| 4832 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 4833 | |||
| 4834 | port = exit_qualification >> 16; | ||
| 4835 | size = (exit_qualification & 7) + 1; | ||
| 4836 | |||
| 4837 | last_bitmap = (gpa_t)-1; | ||
| 4838 | b = -1; | ||
| 4839 | |||
| 4840 | while (size > 0) { | ||
| 4841 | if (port < 0x8000) | ||
| 4842 | bitmap = vmcs12->io_bitmap_a; | ||
| 4843 | else if (port < 0x10000) | ||
| 4844 | bitmap = vmcs12->io_bitmap_b; | ||
| 4845 | else | ||
| 4846 | return true; | ||
| 4847 | bitmap += (port & 0x7fff) / 8; | ||
| 4848 | |||
| 4849 | if (last_bitmap != bitmap) | ||
| 4850 | if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) | ||
| 4851 | return true; | ||
| 4852 | if (b & (1 << (port & 7))) | ||
| 4853 | return true; | ||
| 4854 | |||
| 4855 | port++; | ||
| 4856 | size--; | ||
| 4857 | last_bitmap = bitmap; | ||
| 4858 | } | ||
| 4859 | |||
| 4860 | return false; | ||
| 4861 | } | ||
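Restated outside of KVM, the lookup above reduces to the following sketch (names are illustrative): bitmap_a covers ports 0x0000-0x7fff and bitmap_b covers 0x8000-0xffff, one bit per port, and a multi-byte access is intercepted if any covered port has its bit set.

#include <stdbool.h>
#include <stdint.h>

/* Illustrative lookup mirroring nested_vmx_exit_handled_io(): bitmap_a
 * covers ports 0x0000-0x7fff, bitmap_b covers ports 0x8000-0xffff. */
static bool io_access_intercepted(const uint8_t bitmap_a[4096],
				  const uint8_t bitmap_b[4096],
				  uint32_t port, int size)
{
	while (size > 0) {
		const uint8_t *bitmap;

		if (port < 0x8000)
			bitmap = bitmap_a;
		else if (port < 0x10000)
			bitmap = bitmap_b;
		else
			return true;	/* access wraps past port 0xffff */

		if (bitmap[(port & 0x7fff) / 8] & (1 << (port & 7)))
			return true;

		port++;
		size--;
	}
	return false;
}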
| 4862 | |||
| 4863 | /* | ||
| 4864 | * Return 1 if we should exit from L2 to L1 to handle an MSR access, | ||
| 4865 | * rather than handle it ourselves in L0. I.e., check whether L1 expressed | ||
| 4866 | * disinterest in the current event (read or write a specific MSR) by using an | ||
| 4867 | * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. | ||
| 4868 | */ | ||
| 4869 | static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, | ||
| 4870 | struct vmcs12 *vmcs12, u32 exit_reason) | ||
| 4871 | { | ||
| 4872 | u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; | ||
| 4873 | gpa_t bitmap; | ||
| 4874 | |||
| 4875 | if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) | ||
| 4876 | return true; | ||
| 4877 | |||
| 4878 | /* | ||
| 4879 | * The MSR_BITMAP page is divided into four 1024-byte bitmaps, | ||
| 4880 | * for the four combinations of read/write and low/high MSR numbers. | ||
| 4881 | * First we need to figure out which of the four to use: | ||
| 4882 | */ | ||
| 4883 | bitmap = vmcs12->msr_bitmap; | ||
| 4884 | if (exit_reason == EXIT_REASON_MSR_WRITE) | ||
| 4885 | bitmap += 2048; | ||
| 4886 | if (msr_index >= 0xc0000000) { | ||
| 4887 | msr_index -= 0xc0000000; | ||
| 4888 | bitmap += 1024; | ||
| 4889 | } | ||
| 4890 | |||
| 4891 | /* Then read the msr_index'th bit from this bitmap: */ | ||
| 4892 | if (msr_index < 1024*8) { | ||
| 4893 | unsigned char b; | ||
| 4894 | if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) | ||
| 4895 | return true; | ||
| 4896 | return 1 & (b >> (msr_index & 7)); | ||
| 4897 | } else | ||
| 4898 | return true; /* let L1 handle the wrong parameter */ | ||
| 4899 | } | ||
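A standalone restatement of the bitmap arithmetic above, with the whole 4 KiB page passed as a single array and an illustrative function name: bytes 0-1023 and 1024-2047 are the read bitmaps for the low (0x00000000-0x00001fff) and high (0xc0000000-0xc0001fff) MSR ranges, and bytes 2048-4095 are the corresponding write bitmaps.

#include <stdbool.h>
#include <stdint.h>

/*
 * Illustrative offset calculation into the 4 KiB MSR bitmap page:
 *   bytes    0-1023: read  bitmap, MSRs 0x00000000-0x00001fff
 *   bytes 1024-2047: read  bitmap, MSRs 0xc0000000-0xc0001fff
 *   bytes 2048-3071: write bitmap, low range
 *   bytes 3072-4095: write bitmap, high range
 */
static bool msr_access_intercepted(const uint8_t bitmap[4096],
				   uint32_t msr, bool write)
{
	uint32_t offset = write ? 2048 : 0;

	if (msr >= 0xc0000000) {
		msr -= 0xc0000000;
		offset += 1024;
	}
	if (msr >= 0x2000)		/* outside both ranges: always exit */
		return true;

	return (bitmap[offset + msr / 8] >> (msr % 8)) & 1;
}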
| 4900 | |||
| 4901 | /* | ||
| 4902 | * Return 1 if we should exit from L2 to L1 to handle a CR access exit, | ||
| 4903 | * rather than handle it ourselves in L0. I.e., check if L1 wanted to | ||
| 4904 | * intercept (via guest_host_mask etc.) the current event. | ||
| 4905 | */ | ||
| 4906 | static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, | ||
| 4907 | struct vmcs12 *vmcs12) | ||
| 4908 | { | ||
| 4909 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 4910 | int cr = exit_qualification & 15; | ||
| 4911 | int reg; | ||
| 4912 | unsigned long val; | ||
| 4913 | |||
| 4914 | switch ((exit_qualification >> 4) & 3) { | ||
| 4915 | case 0: /* mov to cr */ | ||
| 4916 | reg = (exit_qualification >> 8) & 15; | ||
| 4917 | val = kvm_register_readl(vcpu, reg); | ||
| 4918 | switch (cr) { | ||
| 4919 | case 0: | ||
| 4920 | if (vmcs12->cr0_guest_host_mask & | ||
| 4921 | (val ^ vmcs12->cr0_read_shadow)) | ||
| 4922 | return true; | ||
| 4923 | break; | ||
| 4924 | case 3: | ||
| 4925 | if ((vmcs12->cr3_target_count >= 1 && | ||
| 4926 | vmcs12->cr3_target_value0 == val) || | ||
| 4927 | (vmcs12->cr3_target_count >= 2 && | ||
| 4928 | vmcs12->cr3_target_value1 == val) || | ||
| 4929 | (vmcs12->cr3_target_count >= 3 && | ||
| 4930 | vmcs12->cr3_target_value2 == val) || | ||
| 4931 | (vmcs12->cr3_target_count >= 4 && | ||
| 4932 | vmcs12->cr3_target_value3 == val)) | ||
| 4933 | return false; | ||
| 4934 | if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) | ||
| 4935 | return true; | ||
| 4936 | break; | ||
| 4937 | case 4: | ||
| 4938 | if (vmcs12->cr4_guest_host_mask & | ||
| 4939 | (vmcs12->cr4_read_shadow ^ val)) | ||
| 4940 | return true; | ||
| 4941 | break; | ||
| 4942 | case 8: | ||
| 4943 | if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) | ||
| 4944 | return true; | ||
| 4945 | break; | ||
| 4946 | } | ||
| 4947 | break; | ||
| 4948 | case 2: /* clts */ | ||
| 4949 | if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && | ||
| 4950 | (vmcs12->cr0_read_shadow & X86_CR0_TS)) | ||
| 4951 | return true; | ||
| 4952 | break; | ||
| 4953 | case 1: /* mov from cr */ | ||
| 4954 | switch (cr) { | ||
| 4955 | case 3: | ||
| 4956 | if (vmcs12->cpu_based_vm_exec_control & | ||
| 4957 | CPU_BASED_CR3_STORE_EXITING) | ||
| 4958 | return true; | ||
| 4959 | break; | ||
| 4960 | case 8: | ||
| 4961 | if (vmcs12->cpu_based_vm_exec_control & | ||
| 4962 | CPU_BASED_CR8_STORE_EXITING) | ||
| 4963 | return true; | ||
| 4964 | break; | ||
| 4965 | } | ||
| 4966 | break; | ||
| 4967 | case 3: /* lmsw */ | ||
| 4968 | /* | ||
| 4969 | * lmsw can change bits 1..3 of cr0, and only set bit 0 of | ||
| 4970 | * cr0. Other attempted changes are ignored, with no exit. | ||
| 4971 | */ | ||
| 4972 | val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; | ||
| 4973 | if (vmcs12->cr0_guest_host_mask & 0xe & | ||
| 4974 | (val ^ vmcs12->cr0_read_shadow)) | ||
| 4975 | return true; | ||
| 4976 | if ((vmcs12->cr0_guest_host_mask & 0x1) && | ||
| 4977 | !(vmcs12->cr0_read_shadow & 0x1) && | ||
| 4978 | (val & 0x1)) | ||
| 4979 | return true; | ||
| 4980 | break; | ||
| 4981 | } | ||
| 4982 | return false; | ||
| 4983 | } | ||
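The LMSW case is the least obvious branch above. Isolated from the rest of the CR-access decoding, and with illustrative names, it reduces to the following check: only the low four CR0 bits are reachable through LMSW, and CR0.PE can only be set, never cleared.

#include <stdbool.h>
#include <stdint.h>

/* Illustrative LMSW intercept check mirroring the 'case 3' branch above:
 * LMSW can modify CR0 bits 1..3 and can set, but never clear, CR0.PE. */
static bool lmsw_intercepted(uint64_t cr0_guest_host_mask,
			     uint64_t cr0_read_shadow, uint16_t lmsw_source)
{
	uint64_t val = lmsw_source & 0xf;

	if (cr0_guest_host_mask & 0xe & (val ^ cr0_read_shadow))
		return true;

	return (cr0_guest_host_mask & 0x1) && !(cr0_read_shadow & 0x1) &&
	       (val & 0x1);
}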
| 4984 | |||
| 4985 | static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, | ||
| 4986 | struct vmcs12 *vmcs12, gpa_t bitmap) | ||
| 4987 | { | ||
| 4988 | u32 vmx_instruction_info; | ||
| 4989 | unsigned long field; | ||
| 4990 | u8 b; | ||
| 4991 | |||
| 4992 | if (!nested_cpu_has_shadow_vmcs(vmcs12)) | ||
| 4993 | return true; | ||
| 4994 | |||
| 4995 | /* Decode instruction info and find the field to access */ | ||
| 4996 | vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
| 4997 | field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); | ||
| 4998 | |||
| 4999 | /* Out-of-range fields always cause a VM exit from L2 to L1 */ | ||
| 5000 | if (field >> 15) | ||
| 5001 | return true; | ||
| 5002 | |||
| 5003 | if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) | ||
| 5004 | return true; | ||
| 5005 | |||
| 5006 | return 1 & (b >> (field & 7)); | ||
| 5007 | } | ||
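The lookup is the same one-bit-per-encoding scheme used for the MSR bitmaps, except that VMCS field encodings are 15 bits wide; a minimal standalone equivalent (with an illustrative name):

#include <stdbool.h>
#include <stdint.h>

/* Illustrative VMREAD/VMWRITE-bitmap test: one bit per 15-bit VMCS field
 * encoding in a 4 KiB page; any wider encoding always exits to L1. */
static bool vmcs_field_intercepted(const uint8_t bitmap[4096], uint64_t field)
{
	if (field >> 15)
		return true;

	return (bitmap[field / 8] >> (field & 7)) & 1;
}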
| 5008 | |||
| 5009 | /* | ||
| 5010 | * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we | ||
| 5011 | * should handle it ourselves in L0 (and then continue L2). Only call this | ||
| 5012 | * when in is_guest_mode (L2). | ||
| 5013 | */ | ||
| 5014 | bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) | ||
| 5015 | { | ||
| 5016 | u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | ||
| 5017 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 5018 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 5019 | |||
| 5020 | if (vmx->nested.nested_run_pending) | ||
| 5021 | return false; | ||
| 5022 | |||
| 5023 | if (unlikely(vmx->fail)) { | ||
| 5024 | pr_info_ratelimited("%s failed vm entry %x\n", __func__, | ||
| 5025 | vmcs_read32(VM_INSTRUCTION_ERROR)); | ||
| 5026 | return true; | ||
| 5027 | } | ||
| 5028 | |||
| 5029 | /* | ||
| 5030 | * The host physical addresses of some pages of guest memory | ||
| 5031 | * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC | ||
| 5032 | * Page). The CPU may write to these pages via their host | ||
| 5033 | * physical address while L2 is running, bypassing any | ||
| 5034 | * address-translation-based dirty tracking (e.g. EPT write | ||
| 5035 | * protection). | ||
| 5036 | * | ||
| 5037 | * Mark them dirty on every exit from L2 to prevent them from | ||
| 5038 | * getting out of sync with dirty tracking. | ||
| 5039 | */ | ||
| 5040 | nested_mark_vmcs12_pages_dirty(vcpu); | ||
| 5041 | |||
| 5042 | trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, | ||
| 5043 | vmcs_readl(EXIT_QUALIFICATION), | ||
| 5044 | vmx->idt_vectoring_info, | ||
| 5045 | intr_info, | ||
| 5046 | vmcs_read32(VM_EXIT_INTR_ERROR_CODE), | ||
| 5047 | KVM_ISA_VMX); | ||
| 5048 | |||
| 5049 | switch (exit_reason) { | ||
| 5050 | case EXIT_REASON_EXCEPTION_NMI: | ||
| 5051 | if (is_nmi(intr_info)) | ||
| 5052 | return false; | ||
| 5053 | else if (is_page_fault(intr_info)) | ||
| 5054 | return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept; | ||
| 5055 | else if (is_debug(intr_info) && | ||
| 5056 | vcpu->guest_debug & | ||
| 5057 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) | ||
| 5058 | return false; | ||
| 5059 | else if (is_breakpoint(intr_info) && | ||
| 5060 | vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) | ||
| 5061 | return false; | ||
| 5062 | return vmcs12->exception_bitmap & | ||
| 5063 | (1u << (intr_info & INTR_INFO_VECTOR_MASK)); | ||
| 5064 | case EXIT_REASON_EXTERNAL_INTERRUPT: | ||
| 5065 | return false; | ||
| 5066 | case EXIT_REASON_TRIPLE_FAULT: | ||
| 5067 | return true; | ||
| 5068 | case EXIT_REASON_PENDING_INTERRUPT: | ||
| 5069 | return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); | ||
| 5070 | case EXIT_REASON_NMI_WINDOW: | ||
| 5071 | return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); | ||
| 5072 | case EXIT_REASON_TASK_SWITCH: | ||
| 5073 | return true; | ||
| 5074 | case EXIT_REASON_CPUID: | ||
| 5075 | return true; | ||
| 5076 | case EXIT_REASON_HLT: | ||
| 5077 | return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); | ||
| 5078 | case EXIT_REASON_INVD: | ||
| 5079 | return true; | ||
| 5080 | case EXIT_REASON_INVLPG: | ||
| 5081 | return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); | ||
| 5082 | case EXIT_REASON_RDPMC: | ||
| 5083 | return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); | ||
| 5084 | case EXIT_REASON_RDRAND: | ||
| 5085 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); | ||
| 5086 | case EXIT_REASON_RDSEED: | ||
| 5087 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); | ||
| 5088 | case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: | ||
| 5089 | return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); | ||
| 5090 | case EXIT_REASON_VMREAD: | ||
| 5091 | return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, | ||
| 5092 | vmcs12->vmread_bitmap); | ||
| 5093 | case EXIT_REASON_VMWRITE: | ||
| 5094 | return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, | ||
| 5095 | vmcs12->vmwrite_bitmap); | ||
| 5096 | case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: | ||
| 5097 | case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: | ||
| 5098 | case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: | ||
| 5099 | case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: | ||
| 5100 | case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: | ||
| 5101 | /* | ||
| 5102 | * VMX instructions trap unconditionally. This allows L1 to | ||
| 5103 | * emulate them for its L2 guest, i.e., allows 3-level nesting! | ||
| 5104 | */ | ||
| 5105 | return true; | ||
| 5106 | case EXIT_REASON_CR_ACCESS: | ||
| 5107 | return nested_vmx_exit_handled_cr(vcpu, vmcs12); | ||
| 5108 | case EXIT_REASON_DR_ACCESS: | ||
| 5109 | return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); | ||
| 5110 | case EXIT_REASON_IO_INSTRUCTION: | ||
| 5111 | return nested_vmx_exit_handled_io(vcpu, vmcs12); | ||
| 5112 | case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: | ||
| 5113 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); | ||
| 5114 | case EXIT_REASON_MSR_READ: | ||
| 5115 | case EXIT_REASON_MSR_WRITE: | ||
| 5116 | return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); | ||
| 5117 | case EXIT_REASON_INVALID_STATE: | ||
| 5118 | return true; | ||
| 5119 | case EXIT_REASON_MWAIT_INSTRUCTION: | ||
| 5120 | return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); | ||
| 5121 | case EXIT_REASON_MONITOR_TRAP_FLAG: | ||
| 5122 | return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG); | ||
| 5123 | case EXIT_REASON_MONITOR_INSTRUCTION: | ||
| 5124 | return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); | ||
| 5125 | case EXIT_REASON_PAUSE_INSTRUCTION: | ||
| 5126 | return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || | ||
| 5127 | nested_cpu_has2(vmcs12, | ||
| 5128 | SECONDARY_EXEC_PAUSE_LOOP_EXITING); | ||
| 5129 | case EXIT_REASON_MCE_DURING_VMENTRY: | ||
| 5130 | return false; | ||
| 5131 | case EXIT_REASON_TPR_BELOW_THRESHOLD: | ||
| 5132 | return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); | ||
| 5133 | case EXIT_REASON_APIC_ACCESS: | ||
| 5134 | case EXIT_REASON_APIC_WRITE: | ||
| 5135 | case EXIT_REASON_EOI_INDUCED: | ||
| 5136 | /* | ||
| 5137 | * The controls for "virtualize APIC accesses," "APIC- | ||
| 5138 | * register virtualization," and "virtual-interrupt | ||
| 5139 | * delivery" only come from vmcs12. | ||
| 5140 | */ | ||
| 5141 | return true; | ||
| 5142 | case EXIT_REASON_EPT_VIOLATION: | ||
| 5143 | /* | ||
| 5144 | * L0 always deals with the EPT violation. If nested EPT is | ||
| 5145 | * used, and the nested mmu code discovers that the address is | ||
| 5146 | * missing in the guest EPT table (EPT12), the EPT violation | ||
| 5147 | * will be injected with nested_ept_inject_page_fault() | ||
| 5148 | */ | ||
| 5149 | return false; | ||
| 5150 | case EXIT_REASON_EPT_MISCONFIG: | ||
| 5151 | /* | ||
| 5152 | * L2 never directly uses L1's EPT, but rather L0's own EPT | ||
| 5153 | * table (shadow on EPT) or a merged EPT table that L0 built | ||
| 5154 | * (EPT on EPT). So any problems with the structure of the | ||
| 5155 | * table are L0's fault. | ||
| 5156 | */ | ||
| 5157 | return false; | ||
| 5158 | case EXIT_REASON_INVPCID: | ||
| 5159 | return | ||
| 5160 | nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && | ||
| 5161 | nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); | ||
| 5162 | case EXIT_REASON_WBINVD: | ||
| 5163 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); | ||
| 5164 | case EXIT_REASON_XSETBV: | ||
| 5165 | return true; | ||
| 5166 | case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: | ||
| 5167 | /* | ||
| 5168 | * This should never happen, since it is not possible to | ||
| 5169 | * set XSS to a non-zero value---neither in L1 nor in L2. | ||
| 5170 | * If it were, XSS would have to be checked against | ||
| 5171 | * the XSS exit bitmap in vmcs12. | ||
| 5172 | */ | ||
| 5173 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); | ||
| 5174 | case EXIT_REASON_PREEMPTION_TIMER: | ||
| 5175 | return false; | ||
| 5176 | case EXIT_REASON_PML_FULL: | ||
| 5177 | /* We emulate PML support to L1. */ | ||
| 5178 | return false; | ||
| 5179 | case EXIT_REASON_VMFUNC: | ||
| 5180 | /* VM functions are emulated through L2->L0 vmexits. */ | ||
| 5181 | return false; | ||
| 5182 | case EXIT_REASON_ENCLS: | ||
| 5183 | /* SGX is never exposed to L1 */ | ||
| 5184 | return false; | ||
| 5185 | default: | ||
| 5186 | return true; | ||
| 5187 | } | ||
| 5188 | } | ||
| 5189 | |||
| 5190 | |||
| 5191 | static int vmx_get_nested_state(struct kvm_vcpu *vcpu, | ||
| 5192 | struct kvm_nested_state __user *user_kvm_nested_state, | ||
| 5193 | u32 user_data_size) | ||
| 5194 | { | ||
| 5195 | struct vcpu_vmx *vmx; | ||
| 5196 | struct vmcs12 *vmcs12; | ||
| 5197 | struct kvm_nested_state kvm_state = { | ||
| 5198 | .flags = 0, | ||
| 5199 | .format = 0, | ||
| 5200 | .size = sizeof(kvm_state), | ||
| 5201 | .vmx.vmxon_pa = -1ull, | ||
| 5202 | .vmx.vmcs_pa = -1ull, | ||
| 5203 | }; | ||
| 5204 | |||
| 5205 | if (!vcpu) | ||
| 5206 | return kvm_state.size + 2 * VMCS12_SIZE; | ||
| 5207 | |||
| 5208 | vmx = to_vmx(vcpu); | ||
| 5209 | vmcs12 = get_vmcs12(vcpu); | ||
| 5210 | |||
| 5211 | if (nested_vmx_allowed(vcpu) && vmx->nested.enlightened_vmcs_enabled) | ||
| 5212 | kvm_state.flags |= KVM_STATE_NESTED_EVMCS; | ||
| 5213 | |||
| 5214 | if (nested_vmx_allowed(vcpu) && | ||
| 5215 | (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { | ||
| 5216 | kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr; | ||
| 5217 | kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr; | ||
| 5218 | |||
| 5219 | if (vmx_has_valid_vmcs12(vcpu)) { | ||
| 5220 | kvm_state.size += VMCS12_SIZE; | ||
| 5221 | |||
| 5222 | if (is_guest_mode(vcpu) && | ||
| 5223 | nested_cpu_has_shadow_vmcs(vmcs12) && | ||
| 5224 | vmcs12->vmcs_link_pointer != -1ull) | ||
| 5225 | kvm_state.size += VMCS12_SIZE; | ||
| 5226 | } | ||
| 5227 | |||
| 5228 | if (vmx->nested.smm.vmxon) | ||
| 5229 | kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; | ||
| 5230 | |||
| 5231 | if (vmx->nested.smm.guest_mode) | ||
| 5232 | kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; | ||
| 5233 | |||
| 5234 | if (is_guest_mode(vcpu)) { | ||
| 5235 | kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; | ||
| 5236 | |||
| 5237 | if (vmx->nested.nested_run_pending) | ||
| 5238 | kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; | ||
| 5239 | } | ||
| 5240 | } | ||
| 5241 | |||
| 5242 | if (user_data_size < kvm_state.size) | ||
| 5243 | goto out; | ||
| 5244 | |||
| 5245 | if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) | ||
| 5246 | return -EFAULT; | ||
| 5247 | |||
| 5248 | if (!vmx_has_valid_vmcs12(vcpu)) | ||
| 5249 | goto out; | ||
| 5250 | |||
| 5251 | /* | ||
| 5252 | * When running L2, the authoritative vmcs12 state is in the | ||
| 5253 | * vmcs02. When running L1, the authoritative vmcs12 state is | ||
| 5254 | * in the shadow or enlightened vmcs linked to vmcs01, unless | ||
| 5255 | * need_vmcs12_sync is set, in which case, the authoritative | ||
| 5256 | * vmcs12 state is in the vmcs12 already. | ||
| 5257 | */ | ||
| 5258 | if (is_guest_mode(vcpu)) { | ||
| 5259 | sync_vmcs12(vcpu, vmcs12); | ||
| 5260 | } else if (!vmx->nested.need_vmcs12_sync) { | ||
| 5261 | if (vmx->nested.hv_evmcs) | ||
| 5262 | copy_enlightened_to_vmcs12(vmx); | ||
| 5263 | else if (enable_shadow_vmcs) | ||
| 5264 | copy_shadow_to_vmcs12(vmx); | ||
| 5265 | } | ||
| 5266 | |||
| 5267 | if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12))) | ||
| 5268 | return -EFAULT; | ||
| 5269 | |||
| 5270 | if (nested_cpu_has_shadow_vmcs(vmcs12) && | ||
| 5271 | vmcs12->vmcs_link_pointer != -1ull) { | ||
| 5272 | if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE, | ||
| 5273 | get_shadow_vmcs12(vcpu), sizeof(*vmcs12))) | ||
| 5274 | return -EFAULT; | ||
| 5275 | } | ||
| 5276 | |||
| 5277 | out: | ||
| 5278 | return kvm_state.size; | ||
| 5279 | } | ||
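To make the size accounting in vmx_get_nested_state() above easier to follow, here is a minimal, illustrative sketch (not part of the patch) of how the variable-size blob returned through KVM_GET_NESTED_STATE is laid out: the fixed header, optionally followed by the cached vmcs12, optionally followed by the cached shadow vmcs12. The helper name and the EXAMPLE_VMCS12_SIZE value of 0x1000 are assumptions for illustration only.

#include <stddef.h>

#define EXAMPLE_VMCS12_SIZE 0x1000	/* assumption: one page per vmcs12 copy */

struct nested_blob_layout {
	size_t vmcs12_off;		/* offset of vmcs12, 0 if absent */
	size_t shadow_vmcs12_off;	/* offset of shadow vmcs12, 0 if absent */
};

/* Consumer-side mirror of the kvm_state.size computation above. */
static struct nested_blob_layout layout_nested_state(size_t header_size,
						     size_t total_size)
{
	struct nested_blob_layout l = { 0, 0 };

	if (total_size >= header_size + EXAMPLE_VMCS12_SIZE)
		l.vmcs12_off = header_size;
	if (total_size >= header_size + 2 * EXAMPLE_VMCS12_SIZE)
		l.shadow_vmcs12_off = header_size + EXAMPLE_VMCS12_SIZE;

	return l;
}

A caller would pass the size of the fixed header as header_size and the size field reported by the ioctl as total_size.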
| 5280 | |||
| 5281 | /* | ||
| 5282 | * Forcibly leave nested mode in order to be able to reset the VCPU later on. | ||
| 5283 | */ | ||
| 5284 | void vmx_leave_nested(struct kvm_vcpu *vcpu) | ||
| 5285 | { | ||
| 5286 | if (is_guest_mode(vcpu)) { | ||
| 5287 | to_vmx(vcpu)->nested.nested_run_pending = 0; | ||
| 5288 | nested_vmx_vmexit(vcpu, -1, 0, 0); | ||
| 5289 | } | ||
| 5290 | free_nested(vcpu); | ||
| 5291 | } | ||
| 5292 | |||
| 5293 | static int vmx_set_nested_state(struct kvm_vcpu *vcpu, | ||
| 5294 | struct kvm_nested_state __user *user_kvm_nested_state, | ||
| 5295 | struct kvm_nested_state *kvm_state) | ||
| 5296 | { | ||
| 5297 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 5298 | struct vmcs12 *vmcs12; | ||
| 5299 | u32 exit_qual; | ||
| 5300 | int ret; | ||
| 5301 | |||
| 5302 | if (kvm_state->format != 0) | ||
| 5303 | return -EINVAL; | ||
| 5304 | |||
| 5305 | if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) | ||
| 5306 | nested_enable_evmcs(vcpu, NULL); | ||
| 5307 | |||
| 5308 | if (!nested_vmx_allowed(vcpu)) | ||
| 5309 | return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL; | ||
| 5310 | |||
| 5311 | if (kvm_state->vmx.vmxon_pa == -1ull) { | ||
| 5312 | if (kvm_state->vmx.smm.flags) | ||
| 5313 | return -EINVAL; | ||
| 5314 | |||
| 5315 | if (kvm_state->vmx.vmcs_pa != -1ull) | ||
| 5316 | return -EINVAL; | ||
| 5317 | |||
| 5318 | vmx_leave_nested(vcpu); | ||
| 5319 | return 0; | ||
| 5320 | } | ||
| 5321 | |||
| 5322 | if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa)) | ||
| 5323 | return -EINVAL; | ||
| 5324 | |||
| 5325 | if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && | ||
| 5326 | (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) | ||
| 5327 | return -EINVAL; | ||
| 5328 | |||
| 5329 | if (kvm_state->vmx.smm.flags & | ||
| 5330 | ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) | ||
| 5331 | return -EINVAL; | ||
| 5332 | |||
| 5333 | /* | ||
| 5334 | * SMM temporarily disables VMX, so we cannot be in guest mode, | ||
| 5335 | * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags | ||
| 5336 | * must be zero. | ||
| 5337 | */ | ||
| 5338 | if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags) | ||
| 5339 | return -EINVAL; | ||
| 5340 | |||
| 5341 | if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && | ||
| 5342 | !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) | ||
| 5343 | return -EINVAL; | ||
| 5344 | |||
| 5345 | vmx_leave_nested(vcpu); | ||
| 5346 | if (kvm_state->vmx.vmxon_pa == -1ull) | ||
| 5347 | return 0; | ||
| 5348 | |||
| 5349 | vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa; | ||
| 5350 | ret = enter_vmx_operation(vcpu); | ||
| 5351 | if (ret) | ||
| 5352 | return ret; | ||
| 5353 | |||
| 5354 | /* Empty 'VMXON' state is permitted */ | ||
| 5355 | if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12)) | ||
| 5356 | return 0; | ||
| 5357 | |||
| 5358 | if (kvm_state->vmx.vmcs_pa != -1ull) { | ||
| 5359 | if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa || | ||
| 5360 | !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa)) | ||
| 5361 | return -EINVAL; | ||
| 5362 | |||
| 5363 | set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa); | ||
| 5364 | } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { | ||
| 5365 | /* | ||
| 5366 | * Sync eVMCS upon entry as we may not have | ||
| 5367 | * HV_X64_MSR_VP_ASSIST_PAGE set up yet. | ||
| 5368 | */ | ||
| 5369 | vmx->nested.need_vmcs12_sync = true; | ||
| 5370 | } else { | ||
| 5371 | return -EINVAL; | ||
| 5372 | } | ||
| 5373 | |||
| 5374 | if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { | ||
| 5375 | vmx->nested.smm.vmxon = true; | ||
| 5376 | vmx->nested.vmxon = false; | ||
| 5377 | |||
| 5378 | if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) | ||
| 5379 | vmx->nested.smm.guest_mode = true; | ||
| 5380 | } | ||
| 5381 | |||
| 5382 | vmcs12 = get_vmcs12(vcpu); | ||
| 5383 | if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12))) | ||
| 5384 | return -EFAULT; | ||
| 5385 | |||
| 5386 | if (vmcs12->hdr.revision_id != VMCS12_REVISION) | ||
| 5387 | return -EINVAL; | ||
| 5388 | |||
| 5389 | if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) | ||
| 5390 | return 0; | ||
| 5391 | |||
| 5392 | vmx->nested.nested_run_pending = | ||
| 5393 | !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); | ||
| 5394 | |||
| 5395 | if (nested_cpu_has_shadow_vmcs(vmcs12) && | ||
| 5396 | vmcs12->vmcs_link_pointer != -1ull) { | ||
| 5397 | struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); | ||
| 5398 | |||
| 5399 | if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12)) | ||
| 5400 | return -EINVAL; | ||
| 5401 | |||
| 5402 | if (copy_from_user(shadow_vmcs12, | ||
| 5403 | user_kvm_nested_state->data + VMCS12_SIZE, | ||
| 5404 | sizeof(*vmcs12))) | ||
| 5405 | return -EFAULT; | ||
| 5406 | |||
| 5407 | if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || | ||
| 5408 | !shadow_vmcs12->hdr.shadow_vmcs) | ||
| 5409 | return -EINVAL; | ||
| 5410 | } | ||
| 5411 | |||
| 5412 | if (nested_vmx_check_vmentry_prereqs(vcpu, vmcs12) || | ||
| 5413 | nested_vmx_check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) | ||
| 5414 | return -EINVAL; | ||
| 5415 | |||
| 5416 | vmx->nested.dirty_vmcs12 = true; | ||
| 5417 | ret = nested_vmx_enter_non_root_mode(vcpu, false); | ||
| 5418 | if (ret) | ||
| 5419 | return -EINVAL; | ||
| 5420 | |||
| 5421 | return 0; | ||
| 5422 | } | ||
| 5423 | |||
| 5424 | void nested_vmx_vcpu_setup(void) | ||
| 5425 | { | ||
| 5426 | if (enable_shadow_vmcs) { | ||
| 5427 | /* | ||
| 5428 | * At vCPU creation, "VMWRITE to any supported field | ||
| 5429 | * in the VMCS" is supported, so use the more | ||
| 5430 | * permissive vmx_vmread_bitmap to specify both read | ||
| 5431 | * and write permissions for the shadow VMCS. | ||
| 5432 | */ | ||
| 5433 | vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); | ||
| 5434 | vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap)); | ||
| 5435 | } | ||
| 5436 | } | ||
| 5437 | |||
| 5438 | /* | ||
| 5439 | * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be | ||
| 5440 | * returned for the various VMX controls MSRs when nested VMX is enabled. | ||
| 5441 | * The same values should also be used to verify that vmcs12 control fields are | ||
| 5442 | * valid during nested entry from L1 to L2. | ||
| 5443 | * Each of these control msrs has a low and high 32-bit half: A low bit is on | ||
| 5444 | * if the corresponding bit in the (32-bit) control field *must* be on, and a | ||
| 5445 | * bit in the high half is on if the corresponding bit in the control field | ||
| 5446 | * may be on. See also vmx_control_verify(). | ||
| 5447 | */ | ||
| 5448 | void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, | ||
| 5449 | bool apicv) | ||
| 5450 | { | ||
| 5451 | /* | ||
| 5452 | * Note that as a general rule, the high half of the MSRs (bits in | ||
| 5453 | * the control fields which may be 1) should be initialized by the | ||
| 5454 | * intersection of the underlying hardware's MSR (i.e., features which | ||
| 5455 | * can be supported) and the list of features we want to expose - | ||
| 5456 | * because they are known to be properly supported in our code. | ||
| 5457 | * Also, usually, the low half of the MSRs (bits which must be 1) can | ||
| 5458 | * be set to 0, meaning that L1 may turn off any of these bits. The | ||
| 5459 | * reason is that if one of these bits is necessary, it will appear | ||
| 5460 | * in vmcs01, and prepare_vmcs02, which bitwise-or's the control | ||
| 5461 | * fields of vmcs01 and vmcs12, will keep these bits set - and | ||
| 5462 | * nested_vmx_exit_reflected() will not pass related exits to L1. | ||
| 5463 | * These rules have exceptions below. | ||
| 5464 | */ | ||
| 5465 | |||
| 5466 | /* pin-based controls */ | ||
| 5467 | rdmsr(MSR_IA32_VMX_PINBASED_CTLS, | ||
| 5468 | msrs->pinbased_ctls_low, | ||
| 5469 | msrs->pinbased_ctls_high); | ||
| 5470 | msrs->pinbased_ctls_low |= | ||
| 5471 | PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; | ||
| 5472 | msrs->pinbased_ctls_high &= | ||
| 5473 | PIN_BASED_EXT_INTR_MASK | | ||
| 5474 | PIN_BASED_NMI_EXITING | | ||
| 5475 | PIN_BASED_VIRTUAL_NMIS | | ||
| 5476 | (apicv ? PIN_BASED_POSTED_INTR : 0); | ||
| 5477 | msrs->pinbased_ctls_high |= | ||
| 5478 | PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | | ||
| 5479 | PIN_BASED_VMX_PREEMPTION_TIMER; | ||
| 5480 | |||
| 5481 | /* exit controls */ | ||
| 5482 | rdmsr(MSR_IA32_VMX_EXIT_CTLS, | ||
| 5483 | msrs->exit_ctls_low, | ||
| 5484 | msrs->exit_ctls_high); | ||
| 5485 | msrs->exit_ctls_low = | ||
| 5486 | VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; | ||
| 5487 | |||
| 5488 | msrs->exit_ctls_high &= | ||
| 5489 | #ifdef CONFIG_X86_64 | ||
| 5490 | VM_EXIT_HOST_ADDR_SPACE_SIZE | | ||
| 5491 | #endif | ||
| 5492 | VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; | ||
| 5493 | msrs->exit_ctls_high |= | ||
| 5494 | VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | | ||
| 5495 | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | | ||
| 5496 | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; | ||
| 5497 | |||
| 5498 | /* We support free control of debug control saving. */ | ||
| 5499 | msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; | ||
| 5500 | |||
| 5501 | /* entry controls */ | ||
| 5502 | rdmsr(MSR_IA32_VMX_ENTRY_CTLS, | ||
| 5503 | msrs->entry_ctls_low, | ||
| 5504 | msrs->entry_ctls_high); | ||
| 5505 | msrs->entry_ctls_low = | ||
| 5506 | VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; | ||
| 5507 | msrs->entry_ctls_high &= | ||
| 5508 | #ifdef CONFIG_X86_64 | ||
| 5509 | VM_ENTRY_IA32E_MODE | | ||
| 5510 | #endif | ||
| 5511 | VM_ENTRY_LOAD_IA32_PAT; | ||
| 5512 | msrs->entry_ctls_high |= | ||
| 5513 | (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); | ||
| 5514 | |||
| 5515 | /* We support free control of debug control loading. */ | ||
| 5516 | msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; | ||
| 5517 | |||
| 5518 | /* cpu-based controls */ | ||
| 5519 | rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, | ||
| 5520 | msrs->procbased_ctls_low, | ||
| 5521 | msrs->procbased_ctls_high); | ||
| 5522 | msrs->procbased_ctls_low = | ||
| 5523 | CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; | ||
| 5524 | msrs->procbased_ctls_high &= | ||
| 5525 | CPU_BASED_VIRTUAL_INTR_PENDING | | ||
| 5526 | CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | | ||
| 5527 | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | | ||
| 5528 | CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | | ||
| 5529 | CPU_BASED_CR3_STORE_EXITING | | ||
| 5530 | #ifdef CONFIG_X86_64 | ||
| 5531 | CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | | ||
| 5532 | #endif | ||
| 5533 | CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | | ||
| 5534 | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | | ||
| 5535 | CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | | ||
| 5536 | CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | | ||
| 5537 | CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; | ||
| 5538 | /* | ||
| 5539 | * We can allow some features even when not supported by the | ||
| 5540 | * hardware. For example, L1 can specify an MSR bitmap - and we | ||
| 5541 | * can use it to avoid exits to L1 - even when L0 runs L2 | ||
| 5542 | * without MSR bitmaps. | ||
| 5543 | */ | ||
| 5544 | msrs->procbased_ctls_high |= | ||
| 5545 | CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | | ||
| 5546 | CPU_BASED_USE_MSR_BITMAPS; | ||
| 5547 | |||
| 5548 | /* We support free control of CR3 access interception. */ | ||
| 5549 | msrs->procbased_ctls_low &= | ||
| 5550 | ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); | ||
| 5551 | |||
| 5552 | /* | ||
| 5553 | * secondary cpu-based controls. Do not include those that | ||
| 5554 | * depend on CPUID bits, they are added later by vmx_cpuid_update. | ||
| 5555 | */ | ||
| 5556 | rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, | ||
| 5557 | msrs->secondary_ctls_low, | ||
| 5558 | msrs->secondary_ctls_high); | ||
| 5559 | msrs->secondary_ctls_low = 0; | ||
| 5560 | msrs->secondary_ctls_high &= | ||
| 5561 | SECONDARY_EXEC_DESC | | ||
| 5562 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | | ||
| 5563 | SECONDARY_EXEC_APIC_REGISTER_VIRT | | ||
| 5564 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | | ||
| 5565 | SECONDARY_EXEC_WBINVD_EXITING; | ||
| 5566 | |||
| 5567 | /* | ||
| 5568 | * We can emulate "VMCS shadowing," even if the hardware | ||
| 5569 | * doesn't support it. | ||
| 5570 | */ | ||
| 5571 | msrs->secondary_ctls_high |= | ||
| 5572 | SECONDARY_EXEC_SHADOW_VMCS; | ||
| 5573 | |||
| 5574 | if (enable_ept) { | ||
| 5575 | /* nested EPT: also emulate EPT for L1 */ | ||
| 5576 | msrs->secondary_ctls_high |= | ||
| 5577 | SECONDARY_EXEC_ENABLE_EPT; | ||
| 5578 | msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT | | ||
| 5579 | VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; | ||
| 5580 | if (cpu_has_vmx_ept_execute_only()) | ||
| 5581 | msrs->ept_caps |= | ||
| 5582 | VMX_EPT_EXECUTE_ONLY_BIT; | ||
| 5583 | msrs->ept_caps &= ept_caps; | ||
| 5584 | msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | | ||
| 5585 | VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | | ||
| 5586 | VMX_EPT_1GB_PAGE_BIT; | ||
| 5587 | if (enable_ept_ad_bits) { | ||
| 5588 | msrs->secondary_ctls_high |= | ||
| 5589 | SECONDARY_EXEC_ENABLE_PML; | ||
| 5590 | msrs->ept_caps |= VMX_EPT_AD_BIT; | ||
| 5591 | } | ||
| 5592 | } | ||
| 5593 | |||
| 5594 | if (cpu_has_vmx_vmfunc()) { | ||
| 5595 | msrs->secondary_ctls_high |= | ||
| 5596 | SECONDARY_EXEC_ENABLE_VMFUNC; | ||
| 5597 | /* | ||
| 5598 | * Advertise EPTP switching unconditionally | ||
| 5599 | * since we emulate it | ||
| 5600 | */ | ||
| 5601 | if (enable_ept) | ||
| 5602 | msrs->vmfunc_controls = | ||
| 5603 | VMX_VMFUNC_EPTP_SWITCHING; | ||
| 5604 | } | ||
| 5605 | |||
| 5606 | /* | ||
| 5607 | * Old versions of KVM use the single-context version without | ||
| 5608 | * checking for support, so declare that it is supported even | ||
| 5609 | * though it is treated as global context. The alternative - | ||
| 5610 | * failing the single-context invvpid - would be worse. | ||
| 5611 | */ | ||
| 5612 | if (enable_vpid) { | ||
| 5613 | msrs->secondary_ctls_high |= | ||
| 5614 | SECONDARY_EXEC_ENABLE_VPID; | ||
| 5615 | msrs->vpid_caps = VMX_VPID_INVVPID_BIT | | ||
| 5616 | VMX_VPID_EXTENT_SUPPORTED_MASK; | ||
| 5617 | } | ||
| 5618 | |||
| 5619 | if (enable_unrestricted_guest) | ||
| 5620 | msrs->secondary_ctls_high |= | ||
| 5621 | SECONDARY_EXEC_UNRESTRICTED_GUEST; | ||
| 5622 | |||
| 5623 | if (flexpriority_enabled) | ||
| 5624 | msrs->secondary_ctls_high |= | ||
| 5625 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
| 5626 | |||
| 5627 | /* miscellaneous data */ | ||
| 5628 | rdmsr(MSR_IA32_VMX_MISC, | ||
| 5629 | msrs->misc_low, | ||
| 5630 | msrs->misc_high); | ||
| 5631 | msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA; | ||
| 5632 | msrs->misc_low |= | ||
| 5633 | MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | | ||
| 5634 | VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | | ||
| 5635 | VMX_MISC_ACTIVITY_HLT; | ||
| 5636 | msrs->misc_high = 0; | ||
| 5637 | |||
| 5638 | /* | ||
| 5639 | * This MSR reports some information about VMX support. We | ||
| 5640 | * should return information about the VMX we emulate for the | ||
| 5641 | * guest, and the VMCS structure we give it - not about the | ||
| 5642 | * VMX support of the underlying hardware. | ||
| 5643 | */ | ||
| 5644 | msrs->basic = | ||
| 5645 | VMCS12_REVISION | | ||
| 5646 | VMX_BASIC_TRUE_CTLS | | ||
| 5647 | ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | | ||
| 5648 | (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); | ||
| 5649 | |||
| 5650 | if (cpu_has_vmx_basic_inout()) | ||
| 5651 | msrs->basic |= VMX_BASIC_INOUT; | ||
| 5652 | |||
| 5653 | /* | ||
| 5654 | * These MSRs specify bits which the guest must keep fixed on | ||
| 5655 | * while L1 is in VMXON mode (in L1's root mode, or running an L2). | ||
| 5656 | * We picked the standard core2 setting. | ||
| 5657 | */ | ||
| 5658 | #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) | ||
| 5659 | #define VMXON_CR4_ALWAYSON X86_CR4_VMXE | ||
| 5660 | msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; | ||
| 5661 | msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; | ||
| 5662 | |||
| 5663 | /* These MSRs specify bits which the guest must keep fixed off. */ | ||
| 5664 | rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); | ||
| 5665 | rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); | ||
| 5666 | |||
| 5667 | /* highest index: VMX_PREEMPTION_TIMER_VALUE */ | ||
| 5668 | msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1; | ||
| 5669 | } | ||
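As a concrete illustration of the low-half/high-half convention described in the comment above nested_vmx_setup_ctls_msrs(), here is a minimal sketch (not part of the patch; the helper name is made up) of the kind of check that vmx_control_verify()/fixed_bits_valid() perform on a vmcs12 control field against one of these MSRs:

/* Illustration only: validate a 32-bit control value against a ctls MSR. */
static inline bool example_control_valid(u32 control, u64 ctls_msr)
{
	u32 must_be_one = (u32)ctls_msr;		/* low half: bits that must be 1 */
	u32 may_be_one  = (u32)(ctls_msr >> 32);	/* high half: bits that may be 1 */

	/* Reject cleared required bits and set unsupported bits. */
	return (control & must_be_one) == must_be_one &&
	       (control & ~may_be_one) == 0;
}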
| 5670 | |||
| 5671 | void nested_vmx_hardware_unsetup(void) | ||
| 5672 | { | ||
| 5673 | int i; | ||
| 5674 | |||
| 5675 | if (enable_shadow_vmcs) { | ||
| 5676 | for (i = 0; i < VMX_BITMAP_NR; i++) | ||
| 5677 | free_page((unsigned long)vmx_bitmap[i]); | ||
| 5678 | } | ||
| 5679 | } | ||
| 5680 | |||
| 5681 | __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) | ||
| 5682 | { | ||
| 5683 | int i; | ||
| 5684 | |||
| 5685 | if (!cpu_has_vmx_shadow_vmcs()) | ||
| 5686 | enable_shadow_vmcs = 0; | ||
| 5687 | if (enable_shadow_vmcs) { | ||
| 5688 | for (i = 0; i < VMX_BITMAP_NR; i++) { | ||
| 5689 | vmx_bitmap[i] = (unsigned long *) | ||
| 5690 | __get_free_page(GFP_KERNEL); | ||
| 5691 | if (!vmx_bitmap[i]) { | ||
| 5692 | nested_vmx_hardware_unsetup(); | ||
| 5693 | return -ENOMEM; | ||
| 5694 | } | ||
| 5695 | } | ||
| 5696 | |||
| 5697 | init_vmcs_shadow_fields(); | ||
| 5698 | } | ||
| 5699 | |||
| 5700 | exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear; | ||
| 5701 | exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch; | ||
| 5702 | exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld; | ||
| 5703 | exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst; | ||
| 5704 | exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; | ||
| 5705 | exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; | ||
| 5706 | exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; | ||
| 5707 | exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff; | ||
| 5708 | exit_handlers[EXIT_REASON_VMON] = handle_vmon; | ||
| 5709 | exit_handlers[EXIT_REASON_INVEPT] = handle_invept; | ||
| 5710 | exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; | ||
| 5711 | exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; | ||
| 5712 | |||
| 5713 | kvm_x86_ops->check_nested_events = vmx_check_nested_events; | ||
| 5714 | kvm_x86_ops->get_nested_state = vmx_get_nested_state; | ||
| 5715 | kvm_x86_ops->set_nested_state = vmx_set_nested_state; | ||
| 5716 | kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages; | ||
| 5717 | kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs; | ||
| 5718 | kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version; | ||
| 5719 | |||
| 5720 | return 0; | ||
| 5721 | } | ||
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h new file mode 100644 index 000000000000..e847ff1019a2 --- /dev/null +++ b/arch/x86/kvm/vmx/nested.h | |||
| @@ -0,0 +1,282 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
| 2 | #ifndef __KVM_X86_VMX_NESTED_H | ||
| 3 | #define __KVM_X86_VMX_NESTED_H | ||
| 4 | |||
| 5 | #include "kvm_cache_regs.h" | ||
| 6 | #include "vmcs12.h" | ||
| 7 | #include "vmx.h" | ||
| 8 | |||
| 9 | void vmx_leave_nested(struct kvm_vcpu *vcpu); | ||
| 10 | void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, | ||
| 11 | bool apicv); | ||
| 12 | void nested_vmx_hardware_unsetup(void); | ||
| 13 | __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)); | ||
| 14 | void nested_vmx_vcpu_setup(void); | ||
| 15 | void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu); | ||
| 16 | int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry); | ||
| 17 | bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason); | ||
| 18 | void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, | ||
| 19 | u32 exit_intr_info, unsigned long exit_qualification); | ||
| 20 | void nested_sync_from_vmcs12(struct kvm_vcpu *vcpu); | ||
| 21 | int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); | ||
| 22 | int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); | ||
| 23 | int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, | ||
| 24 | u32 vmx_instruction_info, bool wr, gva_t *ret); | ||
| 25 | |||
| 26 | static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) | ||
| 27 | { | ||
| 28 | return to_vmx(vcpu)->nested.cached_vmcs12; | ||
| 29 | } | ||
| 30 | |||
| 31 | static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu) | ||
| 32 | { | ||
| 33 | return to_vmx(vcpu)->nested.cached_shadow_vmcs12; | ||
| 34 | } | ||
| 35 | |||
| 36 | static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu) | ||
| 37 | { | ||
| 38 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 39 | |||
| 40 | /* | ||
| 41 | * If we do two consecutive get/set_nested_state()s while L2 is | ||
| 42 | * running, hv_evmcs may end up not being mapped (we map it from | ||
| 43 | * nested_vmx_run()/vmx_vcpu_run()). Check is_guest_mode(), as we | ||
| 44 | * always have a vmcs12 when it is true. | ||
| 45 | */ | ||
| 46 | return is_guest_mode(vcpu) || vmx->nested.current_vmptr != -1ull || | ||
| 47 | vmx->nested.hv_evmcs; | ||
| 48 | } | ||
| 49 | |||
| 50 | static inline unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) | ||
| 51 | { | ||
| 52 | /* return the page table to be shadowed - in our case, EPT12 */ | ||
| 53 | return get_vmcs12(vcpu)->ept_pointer; | ||
| 54 | } | ||
| 55 | |||
| 56 | static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu) | ||
| 57 | { | ||
| 58 | return nested_ept_get_cr3(vcpu) & VMX_EPTP_AD_ENABLE_BIT; | ||
| 59 | } | ||
| 60 | |||
| 61 | /* | ||
| 62 | * Reflect a VM Exit into L1. | ||
| 63 | */ | ||
| 64 | static inline int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, | ||
| 65 | u32 exit_reason) | ||
| 66 | { | ||
| 67 | u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | ||
| 68 | |||
| 69 | /* | ||
| 70 | * At this point, the exit interruption info in exit_intr_info | ||
| 71 | * is only valid for EXCEPTION_NMI exits. For EXTERNAL_INTERRUPT | ||
| 72 | * we need to query the in-kernel LAPIC. | ||
| 73 | */ | ||
| 74 | WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT); | ||
| 75 | if ((exit_intr_info & | ||
| 76 | (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == | ||
| 77 | (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) { | ||
| 78 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 79 | |||
| 80 | vmcs12->vm_exit_intr_error_code = | ||
| 81 | vmcs_read32(VM_EXIT_INTR_ERROR_CODE); | ||
| 82 | } | ||
| 83 | |||
| 84 | nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, | ||
| 85 | vmcs_readl(EXIT_QUALIFICATION)); | ||
| 86 | return 1; | ||
| 87 | } | ||
| 88 | |||
| 89 | /* | ||
| 90 | * Return the cr0 value that a nested guest would read. This is a combination | ||
| 91 | * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by | ||
| 92 | * its hypervisor (cr0_read_shadow). | ||
| 93 | */ | ||
| 94 | static inline unsigned long nested_read_cr0(struct vmcs12 *fields) | ||
| 95 | { | ||
| 96 | return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) | | ||
| 97 | (fields->cr0_read_shadow & fields->cr0_guest_host_mask); | ||
| 98 | } | ||
| 99 | static inline unsigned long nested_read_cr4(struct vmcs12 *fields) | ||
| 100 | { | ||
| 101 | return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) | | ||
| 102 | (fields->cr4_read_shadow & fields->cr4_guest_host_mask); | ||
| 103 | } | ||
| 104 | |||
| 105 | static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu) | ||
| 106 | { | ||
| 107 | return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low); | ||
| 108 | } | ||
| 109 | |||
| 110 | /* | ||
| 111 | * Do the virtual VMX capability MSRs specify that L1 can use VMWRITE | ||
| 112 | * to modify any valid field of the VMCS, or are the VM-exit | ||
| 113 | * information fields read-only? | ||
| 114 | */ | ||
| 115 | static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu) | ||
| 116 | { | ||
| 117 | return to_vmx(vcpu)->nested.msrs.misc_low & | ||
| 118 | MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS; | ||
| 119 | } | ||
| 120 | |||
| 121 | static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu) | ||
| 122 | { | ||
| 123 | return to_vmx(vcpu)->nested.msrs.misc_low & VMX_MISC_ZERO_LEN_INS; | ||
| 124 | } | ||
| 125 | |||
| 126 | static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu) | ||
| 127 | { | ||
| 128 | return to_vmx(vcpu)->nested.msrs.procbased_ctls_high & | ||
| 129 | CPU_BASED_MONITOR_TRAP_FLAG; | ||
| 130 | } | ||
| 131 | |||
| 132 | static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu) | ||
| 133 | { | ||
| 134 | return to_vmx(vcpu)->nested.msrs.secondary_ctls_high & | ||
| 135 | SECONDARY_EXEC_SHADOW_VMCS; | ||
| 136 | } | ||
| 137 | |||
| 138 | static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) | ||
| 139 | { | ||
| 140 | return vmcs12->cpu_based_vm_exec_control & bit; | ||
| 141 | } | ||
| 142 | |||
| 143 | static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) | ||
| 144 | { | ||
| 145 | return (vmcs12->cpu_based_vm_exec_control & | ||
| 146 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && | ||
| 147 | (vmcs12->secondary_vm_exec_control & bit); | ||
| 148 | } | ||
| 149 | |||
| 150 | static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12) | ||
| 151 | { | ||
| 152 | return vmcs12->pin_based_vm_exec_control & | ||
| 153 | PIN_BASED_VMX_PREEMPTION_TIMER; | ||
| 154 | } | ||
| 155 | |||
| 156 | static inline bool nested_cpu_has_nmi_exiting(struct vmcs12 *vmcs12) | ||
| 157 | { | ||
| 158 | return vmcs12->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING; | ||
| 159 | } | ||
| 160 | |||
| 161 | static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) | ||
| 162 | { | ||
| 163 | return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; | ||
| 164 | } | ||
| 165 | |||
| 166 | static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) | ||
| 167 | { | ||
| 168 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); | ||
| 169 | } | ||
| 170 | |||
| 171 | static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) | ||
| 172 | { | ||
| 173 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); | ||
| 174 | } | ||
| 175 | |||
| 176 | static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12) | ||
| 177 | { | ||
| 178 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML); | ||
| 179 | } | ||
| 180 | |||
| 181 | static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) | ||
| 182 | { | ||
| 183 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); | ||
| 184 | } | ||
| 185 | |||
| 186 | static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12) | ||
| 187 | { | ||
| 188 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID); | ||
| 189 | } | ||
| 190 | |||
| 191 | static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) | ||
| 192 | { | ||
| 193 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); | ||
| 194 | } | ||
| 195 | |||
| 196 | static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12) | ||
| 197 | { | ||
| 198 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); | ||
| 199 | } | ||
| 200 | |||
| 201 | static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12) | ||
| 202 | { | ||
| 203 | return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR; | ||
| 204 | } | ||
| 205 | |||
| 206 | static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12) | ||
| 207 | { | ||
| 208 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC); | ||
| 209 | } | ||
| 210 | |||
| 211 | static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12) | ||
| 212 | { | ||
| 213 | return nested_cpu_has_vmfunc(vmcs12) && | ||
| 214 | (vmcs12->vm_function_control & | ||
| 215 | VMX_VMFUNC_EPTP_SWITCHING); | ||
| 216 | } | ||
| 217 | |||
| 218 | static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12) | ||
| 219 | { | ||
| 220 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS); | ||
| 221 | } | ||
| 222 | |||
| 223 | static inline bool nested_cpu_has_save_preemption_timer(struct vmcs12 *vmcs12) | ||
| 224 | { | ||
| 225 | return vmcs12->vm_exit_controls & | ||
| 226 | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; | ||
| 227 | } | ||
| 228 | |||
| 229 | /* | ||
| 230 | * In nested virtualization, check if L1 asked to exit on external interrupts. | ||
| 231 | * For most existing hypervisors, this will always return true. | ||
| 232 | */ | ||
| 233 | static inline bool nested_exit_on_intr(struct kvm_vcpu *vcpu) | ||
| 234 | { | ||
| 235 | return get_vmcs12(vcpu)->pin_based_vm_exec_control & | ||
| 236 | PIN_BASED_EXT_INTR_MASK; | ||
| 237 | } | ||
| 238 | |||
| 239 | /* | ||
| 240 | * if fixed0[i] == 1: val[i] must be 1 | ||
| 241 | * if fixed1[i] == 0: val[i] must be 0 | ||
| 242 | */ | ||
| 243 | static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1) | ||
| 244 | { | ||
| 245 | return ((val & fixed1) | fixed0) == val; | ||
| 246 | } | ||
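A small worked example of the fixed0/fixed1 rule above (illustrative values only, not from the patch):

/*
 * With fixed0 = 0x5 (bits 0 and 2 must be 1) and fixed1 = 0x7 (bit 3 must
 * be 0), fixed_bits_valid() accepts val = 0x5 and val = 0x7, but rejects
 * val = 0x4 (required bit 0 is clear) and val = 0xd (forbidden bit 3 is set).
 */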
| 247 | |||
| 248 | static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) | ||
| 249 | { | ||
| 250 | u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; | ||
| 251 | u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; | ||
| 252 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 253 | |||
| 254 | if (to_vmx(vcpu)->nested.msrs.secondary_ctls_high & | ||
| 255 | SECONDARY_EXEC_UNRESTRICTED_GUEST && | ||
| 256 | nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) | ||
| 257 | fixed0 &= ~(X86_CR0_PE | X86_CR0_PG); | ||
| 258 | |||
| 259 | return fixed_bits_valid(val, fixed0, fixed1); | ||
| 260 | } | ||
| 261 | |||
| 262 | static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) | ||
| 263 | { | ||
| 264 | u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; | ||
| 265 | u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; | ||
| 266 | |||
| 267 | return fixed_bits_valid(val, fixed0, fixed1); | ||
| 268 | } | ||
| 269 | |||
| 270 | static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) | ||
| 271 | { | ||
| 272 | u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0; | ||
| 273 | u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1; | ||
| 274 | |||
| 275 | return fixed_bits_valid(val, fixed0, fixed1); | ||
| 276 | } | ||
| 277 | |||
| 278 | /* No difference in the restrictions on guest and host CR4 in VMX operation. */ | ||
| 279 | #define nested_guest_cr4_valid nested_cr4_valid | ||
| 280 | #define nested_host_cr4_valid nested_cr4_valid | ||
| 281 | |||
| 282 | #endif /* __KVM_X86_VMX_NESTED_H */ | ||
diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h new file mode 100644 index 000000000000..b8e50f76fefc --- /dev/null +++ b/arch/x86/kvm/vmx/ops.h | |||
| @@ -0,0 +1,285 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
| 2 | #ifndef __KVM_X86_VMX_INSN_H | ||
| 3 | #define __KVM_X86_VMX_INSN_H | ||
| 4 | |||
| 5 | #include <linux/nospec.h> | ||
| 6 | |||
| 7 | #include <asm/kvm_host.h> | ||
| 8 | #include <asm/vmx.h> | ||
| 9 | |||
| 10 | #include "evmcs.h" | ||
| 11 | #include "vmcs.h" | ||
| 12 | |||
| 13 | #define __ex(x) __kvm_handle_fault_on_reboot(x) | ||
| 14 | #define __ex_clear(x, reg) \ | ||
| 15 | ____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg) | ||
| 16 | |||
| 17 | static __always_inline void vmcs_check16(unsigned long field) | ||
| 18 | { | ||
| 19 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, | ||
| 20 | "16-bit accessor invalid for 64-bit field"); | ||
| 21 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, | ||
| 22 | "16-bit accessor invalid for 64-bit high field"); | ||
| 23 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, | ||
| 24 | "16-bit accessor invalid for 32-bit high field"); | ||
| 25 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, | ||
| 26 | "16-bit accessor invalid for natural width field"); | ||
| 27 | } | ||
| 28 | |||
| 29 | static __always_inline void vmcs_check32(unsigned long field) | ||
| 30 | { | ||
| 31 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, | ||
| 32 | "32-bit accessor invalid for 16-bit field"); | ||
| 33 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, | ||
| 34 | "32-bit accessor invalid for natural width field"); | ||
| 35 | } | ||
| 36 | |||
| 37 | static __always_inline void vmcs_check64(unsigned long field) | ||
| 38 | { | ||
| 39 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, | ||
| 40 | "64-bit accessor invalid for 16-bit field"); | ||
| 41 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, | ||
| 42 | "64-bit accessor invalid for 64-bit high field"); | ||
| 43 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, | ||
| 44 | "64-bit accessor invalid for 32-bit field"); | ||
| 45 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, | ||
| 46 | "64-bit accessor invalid for natural width field"); | ||
| 47 | } | ||
| 48 | |||
| 49 | static __always_inline void vmcs_checkl(unsigned long field) | ||
| 50 | { | ||
| 51 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, | ||
| 52 | "Natural width accessor invalid for 16-bit field"); | ||
| 53 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, | ||
| 54 | "Natural width accessor invalid for 64-bit field"); | ||
| 55 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, | ||
| 56 | "Natural width accessor invalid for 64-bit high field"); | ||
| 57 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, | ||
| 58 | "Natural width accessor invalid for 32-bit field"); | ||
| 59 | } | ||
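For reference, the masks used by the vmcs_check*() helpers above follow the architectural VMCS field encoding; a brief summary (added for the reader, not part of the patch):

/*
 * VMCS field encoding: bits 14:13 select the width (0 = 16-bit, 1 = 64-bit,
 * 2 = 32-bit, 3 = natural width) and bit 0 selects the high 32 bits of a
 * 64-bit field.  Hence 0x6000 isolates the width bits and 0x6001 also
 * distinguishes the "_HIGH" access form of a 64-bit field.
 */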
| 60 | |||
| 61 | static __always_inline unsigned long __vmcs_readl(unsigned long field) | ||
| 62 | { | ||
| 63 | unsigned long value; | ||
| 64 | |||
| 65 | asm volatile (__ex_clear("vmread %1, %0", "%k0") | ||
| 66 | : "=r"(value) : "r"(field)); | ||
| 67 | return value; | ||
| 68 | } | ||
| 69 | |||
| 70 | static __always_inline u16 vmcs_read16(unsigned long field) | ||
| 71 | { | ||
| 72 | vmcs_check16(field); | ||
| 73 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 74 | return evmcs_read16(field); | ||
| 75 | return __vmcs_readl(field); | ||
| 76 | } | ||
| 77 | |||
| 78 | static __always_inline u32 vmcs_read32(unsigned long field) | ||
| 79 | { | ||
| 80 | vmcs_check32(field); | ||
| 81 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 82 | return evmcs_read32(field); | ||
| 83 | return __vmcs_readl(field); | ||
| 84 | } | ||
| 85 | |||
| 86 | static __always_inline u64 vmcs_read64(unsigned long field) | ||
| 87 | { | ||
| 88 | vmcs_check64(field); | ||
| 89 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 90 | return evmcs_read64(field); | ||
| 91 | #ifdef CONFIG_X86_64 | ||
| 92 | return __vmcs_readl(field); | ||
| 93 | #else | ||
| 94 | return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32); | ||
| 95 | #endif | ||
| 96 | } | ||
| 97 | |||
| 98 | static __always_inline unsigned long vmcs_readl(unsigned long field) | ||
| 99 | { | ||
| 100 | vmcs_checkl(field); | ||
| 101 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 102 | return evmcs_read64(field); | ||
| 103 | return __vmcs_readl(field); | ||
| 104 | } | ||
| 105 | |||
| 106 | static noinline void vmwrite_error(unsigned long field, unsigned long value) | ||
| 107 | { | ||
| 108 | printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", | ||
| 109 | field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); | ||
| 110 | dump_stack(); | ||
| 111 | } | ||
| 112 | |||
| 113 | static __always_inline void __vmcs_writel(unsigned long field, unsigned long value) | ||
| 114 | { | ||
| 115 | bool error; | ||
| 116 | |||
| 117 | asm volatile (__ex("vmwrite %2, %1") CC_SET(na) | ||
| 118 | : CC_OUT(na) (error) : "r"(field), "rm"(value)); | ||
| 119 | if (unlikely(error)) | ||
| 120 | vmwrite_error(field, value); | ||
| 121 | } | ||
| 122 | |||
| 123 | static __always_inline void vmcs_write16(unsigned long field, u16 value) | ||
| 124 | { | ||
| 125 | vmcs_check16(field); | ||
| 126 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 127 | return evmcs_write16(field, value); | ||
| 128 | |||
| 129 | __vmcs_writel(field, value); | ||
| 130 | } | ||
| 131 | |||
| 132 | static __always_inline void vmcs_write32(unsigned long field, u32 value) | ||
| 133 | { | ||
| 134 | vmcs_check32(field); | ||
| 135 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 136 | return evmcs_write32(field, value); | ||
| 137 | |||
| 138 | __vmcs_writel(field, value); | ||
| 139 | } | ||
| 140 | |||
| 141 | static __always_inline void vmcs_write64(unsigned long field, u64 value) | ||
| 142 | { | ||
| 143 | vmcs_check64(field); | ||
| 144 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 145 | return evmcs_write64(field, value); | ||
| 146 | |||
| 147 | __vmcs_writel(field, value); | ||
| 148 | #ifndef CONFIG_X86_64 | ||
| 149 | asm volatile (""); | ||
| 150 | __vmcs_writel(field+1, value >> 32); | ||
| 151 | #endif | ||
| 152 | } | ||
| 153 | |||
| 154 | static __always_inline void vmcs_writel(unsigned long field, unsigned long value) | ||
| 155 | { | ||
| 156 | vmcs_checkl(field); | ||
| 157 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 158 | return evmcs_write64(field, value); | ||
| 159 | |||
| 160 | __vmcs_writel(field, value); | ||
| 161 | } | ||
| 162 | |||
| 163 | static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask) | ||
| 164 | { | ||
| 165 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, | ||
| 166 | "vmcs_clear_bits does not support 64-bit fields"); | ||
| 167 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 168 | return evmcs_write32(field, evmcs_read32(field) & ~mask); | ||
| 169 | |||
| 170 | __vmcs_writel(field, __vmcs_readl(field) & ~mask); | ||
| 171 | } | ||
| 172 | |||
| 173 | static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) | ||
| 174 | { | ||
| 175 | BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, | ||
| 176 | "vmcs_set_bits does not support 64-bit fields"); | ||
| 177 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 178 | return evmcs_write32(field, evmcs_read32(field) | mask); | ||
| 179 | |||
| 180 | __vmcs_writel(field, __vmcs_readl(field) | mask); | ||
| 181 | } | ||
| 182 | |||
| 183 | static inline void vmcs_clear(struct vmcs *vmcs) | ||
| 184 | { | ||
| 185 | u64 phys_addr = __pa(vmcs); | ||
| 186 | bool error; | ||
| 187 | |||
| 188 | asm volatile (__ex("vmclear %1") CC_SET(na) | ||
| 189 | : CC_OUT(na) (error) : "m"(phys_addr)); | ||
| 190 | if (unlikely(error)) | ||
| 191 | printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", | ||
| 192 | vmcs, phys_addr); | ||
| 193 | } | ||
| 194 | |||
| 195 | static inline void vmcs_load(struct vmcs *vmcs) | ||
| 196 | { | ||
| 197 | u64 phys_addr = __pa(vmcs); | ||
| 198 | bool error; | ||
| 199 | |||
| 200 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 201 | return evmcs_load(phys_addr); | ||
| 202 | |||
| 203 | asm volatile (__ex("vmptrld %1") CC_SET(na) | ||
| 204 | : CC_OUT(na) (error) : "m"(phys_addr)); | ||
| 205 | if (unlikely(error)) | ||
| 206 | printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", | ||
| 207 | vmcs, phys_addr); | ||
| 208 | } | ||
| 209 | |||
| 210 | static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva) | ||
| 211 | { | ||
| 212 | struct { | ||
| 213 | u64 vpid : 16; | ||
| 214 | u64 rsvd : 48; | ||
| 215 | u64 gva; | ||
| 216 | } operand = { vpid, 0, gva }; | ||
| 217 | bool error; | ||
| 218 | |||
| 219 | asm volatile (__ex("invvpid %2, %1") CC_SET(na) | ||
| 220 | : CC_OUT(na) (error) : "r"(ext), "m"(operand)); | ||
| 221 | BUG_ON(error); | ||
| 222 | } | ||
| 223 | |||
| 224 | static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa) | ||
| 225 | { | ||
| 226 | struct { | ||
| 227 | u64 eptp, gpa; | ||
| 228 | } operand = {eptp, gpa}; | ||
| 229 | bool error; | ||
| 230 | |||
| 231 | asm volatile (__ex("invept %2, %1") CC_SET(na) | ||
| 232 | : CC_OUT(na) (error) : "r"(ext), "m"(operand)); | ||
| 233 | BUG_ON(error); | ||
| 234 | } | ||
| 235 | |||
| 236 | static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr) | ||
| 237 | { | ||
| 238 | if (vpid == 0) | ||
| 239 | return true; | ||
| 240 | |||
| 241 | if (cpu_has_vmx_invvpid_individual_addr()) { | ||
| 242 | __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr); | ||
| 243 | return true; | ||
| 244 | } | ||
| 245 | |||
| 246 | return false; | ||
| 247 | } | ||
| 248 | |||
| 249 | static inline void vpid_sync_vcpu_single(int vpid) | ||
| 250 | { | ||
| 251 | if (vpid == 0) | ||
| 252 | return; | ||
| 253 | |||
| 254 | if (cpu_has_vmx_invvpid_single()) | ||
| 255 | __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0); | ||
| 256 | } | ||
| 257 | |||
| 258 | static inline void vpid_sync_vcpu_global(void) | ||
| 259 | { | ||
| 260 | if (cpu_has_vmx_invvpid_global()) | ||
| 261 | __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); | ||
| 262 | } | ||
| 263 | |||
| 264 | static inline void vpid_sync_context(int vpid) | ||
| 265 | { | ||
| 266 | if (cpu_has_vmx_invvpid_single()) | ||
| 267 | vpid_sync_vcpu_single(vpid); | ||
| 268 | else | ||
| 269 | vpid_sync_vcpu_global(); | ||
| 270 | } | ||
| 271 | |||
| 272 | static inline void ept_sync_global(void) | ||
| 273 | { | ||
| 274 | __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); | ||
| 275 | } | ||
| 276 | |||
| 277 | static inline void ept_sync_context(u64 eptp) | ||
| 278 | { | ||
| 279 | if (cpu_has_vmx_invept_context()) | ||
| 280 | __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); | ||
| 281 | else | ||
| 282 | ept_sync_global(); | ||
| 283 | } | ||
| 284 | |||
| 285 | #endif /* __KVM_X86_VMX_INSN_H */ | ||
diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 5ab4a364348e..5ab4a364348e 100644 --- a/arch/x86/kvm/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c | |||
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h new file mode 100644 index 000000000000..6def3ba88e3b --- /dev/null +++ b/arch/x86/kvm/vmx/vmcs.h | |||
| @@ -0,0 +1,136 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
| 2 | #ifndef __KVM_X86_VMX_VMCS_H | ||
| 3 | #define __KVM_X86_VMX_VMCS_H | ||
| 4 | |||
| 5 | #include <linux/ktime.h> | ||
| 6 | #include <linux/list.h> | ||
| 7 | #include <linux/nospec.h> | ||
| 8 | |||
| 9 | #include <asm/kvm.h> | ||
| 10 | #include <asm/vmx.h> | ||
| 11 | |||
| 12 | #include "capabilities.h" | ||
| 13 | |||
| 14 | struct vmcs_hdr { | ||
| 15 | u32 revision_id:31; | ||
| 16 | u32 shadow_vmcs:1; | ||
| 17 | }; | ||
| 18 | |||
| 19 | struct vmcs { | ||
| 20 | struct vmcs_hdr hdr; | ||
| 21 | u32 abort; | ||
| 22 | char data[0]; | ||
| 23 | }; | ||
| 24 | |||
| 25 | DECLARE_PER_CPU(struct vmcs *, current_vmcs); | ||
| 26 | |||
| 27 | /* | ||
| 28 | * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT | ||
| 29 | * and whose values change infrequently, but are not constant. I.e. this is | ||
| 30 | * used as a write-through cache of the corresponding VMCS fields. | ||
| 31 | */ | ||
| 32 | struct vmcs_host_state { | ||
| 33 | unsigned long cr3; /* May not match real cr3 */ | ||
| 34 | unsigned long cr4; /* May not match real cr4 */ | ||
| 35 | unsigned long gs_base; | ||
| 36 | unsigned long fs_base; | ||
| 37 | |||
| 38 | u16 fs_sel, gs_sel, ldt_sel; | ||
| 39 | #ifdef CONFIG_X86_64 | ||
| 40 | u16 ds_sel, es_sel; | ||
| 41 | #endif | ||
| 42 | }; | ||
| 43 | |||
| 44 | /* | ||
| 45 | * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also | ||
| 46 | * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs | ||
| 47 | * loaded on this CPU (so we can clear them if the CPU goes down). | ||
| 48 | */ | ||
| 49 | struct loaded_vmcs { | ||
| 50 | struct vmcs *vmcs; | ||
| 51 | struct vmcs *shadow_vmcs; | ||
| 52 | int cpu; | ||
| 53 | bool launched; | ||
| 54 | bool nmi_known_unmasked; | ||
| 55 | bool hv_timer_armed; | ||
| 56 | /* Support for vnmi-less CPUs */ | ||
| 57 | int soft_vnmi_blocked; | ||
| 58 | ktime_t entry_time; | ||
| 59 | s64 vnmi_blocked_time; | ||
| 60 | unsigned long *msr_bitmap; | ||
| 61 | struct list_head loaded_vmcss_on_cpu_link; | ||
| 62 | struct vmcs_host_state host_state; | ||
| 63 | }; | ||
| 64 | |||
| 65 | static inline bool is_exception_n(u32 intr_info, u8 vector) | ||
| 66 | { | ||
| 67 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | | ||
| 68 | INTR_INFO_VALID_MASK)) == | ||
| 69 | (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK); | ||
| 70 | } | ||
| 71 | |||
| 72 | static inline bool is_debug(u32 intr_info) | ||
| 73 | { | ||
| 74 | return is_exception_n(intr_info, DB_VECTOR); | ||
| 75 | } | ||
| 76 | |||
| 77 | static inline bool is_breakpoint(u32 intr_info) | ||
| 78 | { | ||
| 79 | return is_exception_n(intr_info, BP_VECTOR); | ||
| 80 | } | ||
| 81 | |||
| 82 | static inline bool is_page_fault(u32 intr_info) | ||
| 83 | { | ||
| 84 | return is_exception_n(intr_info, PF_VECTOR); | ||
| 85 | } | ||
| 86 | |||
| 87 | static inline bool is_invalid_opcode(u32 intr_info) | ||
| 88 | { | ||
| 89 | return is_exception_n(intr_info, UD_VECTOR); | ||
| 90 | } | ||
| 91 | |||
| 92 | static inline bool is_gp_fault(u32 intr_info) | ||
| 93 | { | ||
| 94 | return is_exception_n(intr_info, GP_VECTOR); | ||
| 95 | } | ||
| 96 | |||
| 97 | static inline bool is_machine_check(u32 intr_info) | ||
| 98 | { | ||
| 99 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | | ||
| 100 | INTR_INFO_VALID_MASK)) == | ||
| 101 | (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); | ||
| 102 | } | ||
| 103 | |||
| 104 | /* Undocumented: icebp/int1 */ | ||
| 105 | static inline bool is_icebp(u32 intr_info) | ||
| 106 | { | ||
| 107 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) | ||
| 108 | == (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK); | ||
| 109 | } | ||
| 110 | |||
| 111 | static inline bool is_nmi(u32 intr_info) | ||
| 112 | { | ||
| 113 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) | ||
| 114 | == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK); | ||
| 115 | } | ||
| 116 | |||
| 117 | enum vmcs_field_width { | ||
| 118 | VMCS_FIELD_WIDTH_U16 = 0, | ||
| 119 | VMCS_FIELD_WIDTH_U64 = 1, | ||
| 120 | VMCS_FIELD_WIDTH_U32 = 2, | ||
| 121 | VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3 | ||
| 122 | }; | ||
| 123 | |||
| 124 | static inline int vmcs_field_width(unsigned long field) | ||
| 125 | { | ||
| 126 | if (0x1 & field) /* the *_HIGH fields are all 32 bit */ | ||
| 127 | return VMCS_FIELD_WIDTH_U32; | ||
| 128 | return (field >> 13) & 0x3; | ||
| 129 | } | ||
| 130 | |||
| 131 | static inline int vmcs_field_readonly(unsigned long field) | ||
| 132 | { | ||
| 133 | return (((field >> 10) & 0x3) == 1); | ||
| 134 | } | ||
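A concrete example of the encoding decoded by the two helpers above (field numbers are the architectural ones from asm/vmx.h; added here for illustration):

/*
 * GUEST_ES_LIMIT is encoded as 0x4800:
 *   (0x4800 >> 13) & 0x3 == 2 == VMCS_FIELD_WIDTH_U32, and
 *   (0x4800 >> 10) & 0x3 == 2, so it is not read-only.
 * VM_EXIT_REASON is encoded as 0x4402:
 *   (0x4402 >> 13) & 0x3 == 2 == VMCS_FIELD_WIDTH_U32, and
 *   (0x4402 >> 10) & 0x3 == 1, so it is a read-only data field.
 */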
| 135 | |||
| 136 | #endif /* __KVM_X86_VMX_VMCS_H */ | ||
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c new file mode 100644 index 000000000000..53dfb401316d --- /dev/null +++ b/arch/x86/kvm/vmx/vmcs12.c | |||
| @@ -0,0 +1,157 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0 | ||
| 2 | |||
| 3 | #include "vmcs12.h" | ||
| 4 | |||
| 5 | #define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) | ||
| 6 | #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) | ||
| 7 | #define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name) | ||
| 8 | #define FIELD64(number, name) \ | ||
| 9 | FIELD(number, name), \ | ||
| 10 | [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32) | ||
| 11 | |||
| 12 | const unsigned short vmcs_field_to_offset_table[] = { | ||
| 13 | FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), | ||
| 14 | FIELD(POSTED_INTR_NV, posted_intr_nv), | ||
| 15 | FIELD(GUEST_ES_SELECTOR, guest_es_selector), | ||
| 16 | FIELD(GUEST_CS_SELECTOR, guest_cs_selector), | ||
| 17 | FIELD(GUEST_SS_SELECTOR, guest_ss_selector), | ||
| 18 | FIELD(GUEST_DS_SELECTOR, guest_ds_selector), | ||
| 19 | FIELD(GUEST_FS_SELECTOR, guest_fs_selector), | ||
| 20 | FIELD(GUEST_GS_SELECTOR, guest_gs_selector), | ||
| 21 | FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), | ||
| 22 | FIELD(GUEST_TR_SELECTOR, guest_tr_selector), | ||
| 23 | FIELD(GUEST_INTR_STATUS, guest_intr_status), | ||
| 24 | FIELD(GUEST_PML_INDEX, guest_pml_index), | ||
| 25 | FIELD(HOST_ES_SELECTOR, host_es_selector), | ||
| 26 | FIELD(HOST_CS_SELECTOR, host_cs_selector), | ||
| 27 | FIELD(HOST_SS_SELECTOR, host_ss_selector), | ||
| 28 | FIELD(HOST_DS_SELECTOR, host_ds_selector), | ||
| 29 | FIELD(HOST_FS_SELECTOR, host_fs_selector), | ||
| 30 | FIELD(HOST_GS_SELECTOR, host_gs_selector), | ||
| 31 | FIELD(HOST_TR_SELECTOR, host_tr_selector), | ||
| 32 | FIELD64(IO_BITMAP_A, io_bitmap_a), | ||
| 33 | FIELD64(IO_BITMAP_B, io_bitmap_b), | ||
| 34 | FIELD64(MSR_BITMAP, msr_bitmap), | ||
| 35 | FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr), | ||
| 36 | FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr), | ||
| 37 | FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr), | ||
| 38 | FIELD64(PML_ADDRESS, pml_address), | ||
| 39 | FIELD64(TSC_OFFSET, tsc_offset), | ||
| 40 | FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), | ||
| 41 | FIELD64(APIC_ACCESS_ADDR, apic_access_addr), | ||
| 42 | FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr), | ||
| 43 | FIELD64(VM_FUNCTION_CONTROL, vm_function_control), | ||
| 44 | FIELD64(EPT_POINTER, ept_pointer), | ||
| 45 | FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0), | ||
| 46 | FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1), | ||
| 47 | FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2), | ||
| 48 | FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3), | ||
| 49 | FIELD64(EPTP_LIST_ADDRESS, eptp_list_address), | ||
| 50 | FIELD64(VMREAD_BITMAP, vmread_bitmap), | ||
| 51 | FIELD64(VMWRITE_BITMAP, vmwrite_bitmap), | ||
| 52 | FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), | ||
| 53 | FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), | ||
| 54 | FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), | ||
| 55 | FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), | ||
| 56 | FIELD64(GUEST_IA32_PAT, guest_ia32_pat), | ||
| 57 | FIELD64(GUEST_IA32_EFER, guest_ia32_efer), | ||
| 58 | FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl), | ||
| 59 | FIELD64(GUEST_PDPTR0, guest_pdptr0), | ||
| 60 | FIELD64(GUEST_PDPTR1, guest_pdptr1), | ||
| 61 | FIELD64(GUEST_PDPTR2, guest_pdptr2), | ||
| 62 | FIELD64(GUEST_PDPTR3, guest_pdptr3), | ||
| 63 | FIELD64(GUEST_BNDCFGS, guest_bndcfgs), | ||
| 64 | FIELD64(HOST_IA32_PAT, host_ia32_pat), | ||
| 65 | FIELD64(HOST_IA32_EFER, host_ia32_efer), | ||
| 66 | FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl), | ||
| 67 | FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control), | ||
| 68 | FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control), | ||
| 69 | FIELD(EXCEPTION_BITMAP, exception_bitmap), | ||
| 70 | FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask), | ||
| 71 | FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match), | ||
| 72 | FIELD(CR3_TARGET_COUNT, cr3_target_count), | ||
| 73 | FIELD(VM_EXIT_CONTROLS, vm_exit_controls), | ||
| 74 | FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count), | ||
| 75 | FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count), | ||
| 76 | FIELD(VM_ENTRY_CONTROLS, vm_entry_controls), | ||
| 77 | FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count), | ||
| 78 | FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field), | ||
| 79 | FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code), | ||
| 80 | FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len), | ||
| 81 | FIELD(TPR_THRESHOLD, tpr_threshold), | ||
| 82 | FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control), | ||
| 83 | FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error), | ||
| 84 | FIELD(VM_EXIT_REASON, vm_exit_reason), | ||
| 85 | FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info), | ||
| 86 | FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code), | ||
| 87 | FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field), | ||
| 88 | FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code), | ||
| 89 | FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len), | ||
| 90 | FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info), | ||
| 91 | FIELD(GUEST_ES_LIMIT, guest_es_limit), | ||
| 92 | FIELD(GUEST_CS_LIMIT, guest_cs_limit), | ||
| 93 | FIELD(GUEST_SS_LIMIT, guest_ss_limit), | ||
| 94 | FIELD(GUEST_DS_LIMIT, guest_ds_limit), | ||
| 95 | FIELD(GUEST_FS_LIMIT, guest_fs_limit), | ||
| 96 | FIELD(GUEST_GS_LIMIT, guest_gs_limit), | ||
| 97 | FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit), | ||
| 98 | FIELD(GUEST_TR_LIMIT, guest_tr_limit), | ||
| 99 | FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit), | ||
| 100 | FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit), | ||
| 101 | FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes), | ||
| 102 | FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes), | ||
| 103 | FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes), | ||
| 104 | FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes), | ||
| 105 | FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes), | ||
| 106 | FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes), | ||
| 107 | FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes), | ||
| 108 | FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes), | ||
| 109 | FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info), | ||
| 110 | FIELD(GUEST_ACTIVITY_STATE, guest_activity_state), | ||
| 111 | FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs), | ||
| 112 | FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs), | ||
| 113 | FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value), | ||
| 114 | FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask), | ||
| 115 | FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask), | ||
| 116 | FIELD(CR0_READ_SHADOW, cr0_read_shadow), | ||
| 117 | FIELD(CR4_READ_SHADOW, cr4_read_shadow), | ||
| 118 | FIELD(CR3_TARGET_VALUE0, cr3_target_value0), | ||
| 119 | FIELD(CR3_TARGET_VALUE1, cr3_target_value1), | ||
| 120 | FIELD(CR3_TARGET_VALUE2, cr3_target_value2), | ||
| 121 | FIELD(CR3_TARGET_VALUE3, cr3_target_value3), | ||
| 122 | FIELD(EXIT_QUALIFICATION, exit_qualification), | ||
| 123 | FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address), | ||
| 124 | FIELD(GUEST_CR0, guest_cr0), | ||
| 125 | FIELD(GUEST_CR3, guest_cr3), | ||
| 126 | FIELD(GUEST_CR4, guest_cr4), | ||
| 127 | FIELD(GUEST_ES_BASE, guest_es_base), | ||
| 128 | FIELD(GUEST_CS_BASE, guest_cs_base), | ||
| 129 | FIELD(GUEST_SS_BASE, guest_ss_base), | ||
| 130 | FIELD(GUEST_DS_BASE, guest_ds_base), | ||
| 131 | FIELD(GUEST_FS_BASE, guest_fs_base), | ||
| 132 | FIELD(GUEST_GS_BASE, guest_gs_base), | ||
| 133 | FIELD(GUEST_LDTR_BASE, guest_ldtr_base), | ||
| 134 | FIELD(GUEST_TR_BASE, guest_tr_base), | ||
| 135 | FIELD(GUEST_GDTR_BASE, guest_gdtr_base), | ||
| 136 | FIELD(GUEST_IDTR_BASE, guest_idtr_base), | ||
| 137 | FIELD(GUEST_DR7, guest_dr7), | ||
| 138 | FIELD(GUEST_RSP, guest_rsp), | ||
| 139 | FIELD(GUEST_RIP, guest_rip), | ||
| 140 | FIELD(GUEST_RFLAGS, guest_rflags), | ||
| 141 | FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions), | ||
| 142 | FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp), | ||
| 143 | FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip), | ||
| 144 | FIELD(HOST_CR0, host_cr0), | ||
| 145 | FIELD(HOST_CR3, host_cr3), | ||
| 146 | FIELD(HOST_CR4, host_cr4), | ||
| 147 | FIELD(HOST_FS_BASE, host_fs_base), | ||
| 148 | FIELD(HOST_GS_BASE, host_gs_base), | ||
| 149 | FIELD(HOST_TR_BASE, host_tr_base), | ||
| 150 | FIELD(HOST_GDTR_BASE, host_gdtr_base), | ||
| 151 | FIELD(HOST_IDTR_BASE, host_idtr_base), | ||
| 152 | FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp), | ||
| 153 | FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip), | ||
| 154 | FIELD(HOST_RSP, host_rsp), | ||
| 155 | FIELD(HOST_RIP, host_rip), | ||
| 156 | }; | ||
| 157 | const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs_field_to_offset_table); | ||
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h new file mode 100644 index 000000000000..3a742428ad17 --- /dev/null +++ b/arch/x86/kvm/vmx/vmcs12.h | |||
| @@ -0,0 +1,462 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
| 2 | #ifndef __KVM_X86_VMX_VMCS12_H | ||
| 3 | #define __KVM_X86_VMX_VMCS12_H | ||
| 4 | |||
| 5 | #include <linux/build_bug.h> | ||
| 6 | |||
| 7 | #include "vmcs.h" | ||
| 8 | |||
| 9 | /* | ||
| 10 | * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a | ||
| 11 | * single nested guest (L2), hence the name vmcs12. Any VMX implementation has | ||
| 12 | * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is | ||
| 13 | * stored in guest memory specified by VMPTRLD, but is opaque to the guest, | ||
| 14 | * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. | ||
| 15 | * More than one of these structures may exist, if L1 runs multiple L2 guests. | ||
| 16 | * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the | ||
| 17 | * underlying hardware which will be used to run L2. | ||
| 18 | * This structure is packed to ensure that its layout is identical across | ||
| 19 | * machines (necessary for live migration). | ||
| 20 | * | ||
| 21 | * IMPORTANT: Changing the layout of existing fields in this structure | ||
| 22 | * will break save/restore compatibility with older kvm releases. When | ||
| 23 | * adding new fields, either use space in the reserved padding* arrays | ||
| 24 | * or add the new fields to the end of the structure. | ||
| 25 | */ | ||
| 26 | typedef u64 natural_width; | ||
| 27 | struct __packed vmcs12 { | ||
| 28 | /* According to the Intel spec, a VMCS region must start with the | ||
| 29 | * following two fields. Then follow implementation-specific data. | ||
| 30 | */ | ||
| 31 | struct vmcs_hdr hdr; | ||
| 32 | u32 abort; | ||
| 33 | |||
| 34 | u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ | ||
| 35 | u32 padding[7]; /* room for future expansion */ | ||
| 36 | |||
| 37 | u64 io_bitmap_a; | ||
| 38 | u64 io_bitmap_b; | ||
| 39 | u64 msr_bitmap; | ||
| 40 | u64 vm_exit_msr_store_addr; | ||
| 41 | u64 vm_exit_msr_load_addr; | ||
| 42 | u64 vm_entry_msr_load_addr; | ||
| 43 | u64 tsc_offset; | ||
| 44 | u64 virtual_apic_page_addr; | ||
| 45 | u64 apic_access_addr; | ||
| 46 | u64 posted_intr_desc_addr; | ||
| 47 | u64 ept_pointer; | ||
| 48 | u64 eoi_exit_bitmap0; | ||
| 49 | u64 eoi_exit_bitmap1; | ||
| 50 | u64 eoi_exit_bitmap2; | ||
| 51 | u64 eoi_exit_bitmap3; | ||
| 52 | u64 xss_exit_bitmap; | ||
| 53 | u64 guest_physical_address; | ||
| 54 | u64 vmcs_link_pointer; | ||
| 55 | u64 guest_ia32_debugctl; | ||
| 56 | u64 guest_ia32_pat; | ||
| 57 | u64 guest_ia32_efer; | ||
| 58 | u64 guest_ia32_perf_global_ctrl; | ||
| 59 | u64 guest_pdptr0; | ||
| 60 | u64 guest_pdptr1; | ||
| 61 | u64 guest_pdptr2; | ||
| 62 | u64 guest_pdptr3; | ||
| 63 | u64 guest_bndcfgs; | ||
| 64 | u64 host_ia32_pat; | ||
| 65 | u64 host_ia32_efer; | ||
| 66 | u64 host_ia32_perf_global_ctrl; | ||
| 67 | u64 vmread_bitmap; | ||
| 68 | u64 vmwrite_bitmap; | ||
| 69 | u64 vm_function_control; | ||
| 70 | u64 eptp_list_address; | ||
| 71 | u64 pml_address; | ||
| 72 | u64 padding64[3]; /* room for future expansion */ | ||
| 73 | /* | ||
| 74 | * To allow migration of L1 (complete with its L2 guests) between | ||
| 75 | * machines of different natural widths (32 or 64 bit), we cannot have | ||
| 76 | * unsigned long fields with no explicit size. We use u64 (aliased | ||
| 77 | * natural_width) instead. Luckily, x86 is little-endian. | ||
| 78 | */ | ||
| 79 | natural_width cr0_guest_host_mask; | ||
| 80 | natural_width cr4_guest_host_mask; | ||
| 81 | natural_width cr0_read_shadow; | ||
| 82 | natural_width cr4_read_shadow; | ||
| 83 | natural_width cr3_target_value0; | ||
| 84 | natural_width cr3_target_value1; | ||
| 85 | natural_width cr3_target_value2; | ||
| 86 | natural_width cr3_target_value3; | ||
| 87 | natural_width exit_qualification; | ||
| 88 | natural_width guest_linear_address; | ||
| 89 | natural_width guest_cr0; | ||
| 90 | natural_width guest_cr3; | ||
| 91 | natural_width guest_cr4; | ||
| 92 | natural_width guest_es_base; | ||
| 93 | natural_width guest_cs_base; | ||
| 94 | natural_width guest_ss_base; | ||
| 95 | natural_width guest_ds_base; | ||
| 96 | natural_width guest_fs_base; | ||
| 97 | natural_width guest_gs_base; | ||
| 98 | natural_width guest_ldtr_base; | ||
| 99 | natural_width guest_tr_base; | ||
| 100 | natural_width guest_gdtr_base; | ||
| 101 | natural_width guest_idtr_base; | ||
| 102 | natural_width guest_dr7; | ||
| 103 | natural_width guest_rsp; | ||
| 104 | natural_width guest_rip; | ||
| 105 | natural_width guest_rflags; | ||
| 106 | natural_width guest_pending_dbg_exceptions; | ||
| 107 | natural_width guest_sysenter_esp; | ||
| 108 | natural_width guest_sysenter_eip; | ||
| 109 | natural_width host_cr0; | ||
| 110 | natural_width host_cr3; | ||
| 111 | natural_width host_cr4; | ||
| 112 | natural_width host_fs_base; | ||
| 113 | natural_width host_gs_base; | ||
| 114 | natural_width host_tr_base; | ||
| 115 | natural_width host_gdtr_base; | ||
| 116 | natural_width host_idtr_base; | ||
| 117 | natural_width host_ia32_sysenter_esp; | ||
| 118 | natural_width host_ia32_sysenter_eip; | ||
| 119 | natural_width host_rsp; | ||
| 120 | natural_width host_rip; | ||
| 121 | natural_width paddingl[8]; /* room for future expansion */ | ||
| 122 | u32 pin_based_vm_exec_control; | ||
| 123 | u32 cpu_based_vm_exec_control; | ||
| 124 | u32 exception_bitmap; | ||
| 125 | u32 page_fault_error_code_mask; | ||
| 126 | u32 page_fault_error_code_match; | ||
| 127 | u32 cr3_target_count; | ||
| 128 | u32 vm_exit_controls; | ||
| 129 | u32 vm_exit_msr_store_count; | ||
| 130 | u32 vm_exit_msr_load_count; | ||
| 131 | u32 vm_entry_controls; | ||
| 132 | u32 vm_entry_msr_load_count; | ||
| 133 | u32 vm_entry_intr_info_field; | ||
| 134 | u32 vm_entry_exception_error_code; | ||
| 135 | u32 vm_entry_instruction_len; | ||
| 136 | u32 tpr_threshold; | ||
| 137 | u32 secondary_vm_exec_control; | ||
| 138 | u32 vm_instruction_error; | ||
| 139 | u32 vm_exit_reason; | ||
| 140 | u32 vm_exit_intr_info; | ||
| 141 | u32 vm_exit_intr_error_code; | ||
| 142 | u32 idt_vectoring_info_field; | ||
| 143 | u32 idt_vectoring_error_code; | ||
| 144 | u32 vm_exit_instruction_len; | ||
| 145 | u32 vmx_instruction_info; | ||
| 146 | u32 guest_es_limit; | ||
| 147 | u32 guest_cs_limit; | ||
| 148 | u32 guest_ss_limit; | ||
| 149 | u32 guest_ds_limit; | ||
| 150 | u32 guest_fs_limit; | ||
| 151 | u32 guest_gs_limit; | ||
| 152 | u32 guest_ldtr_limit; | ||
| 153 | u32 guest_tr_limit; | ||
| 154 | u32 guest_gdtr_limit; | ||
| 155 | u32 guest_idtr_limit; | ||
| 156 | u32 guest_es_ar_bytes; | ||
| 157 | u32 guest_cs_ar_bytes; | ||
| 158 | u32 guest_ss_ar_bytes; | ||
| 159 | u32 guest_ds_ar_bytes; | ||
| 160 | u32 guest_fs_ar_bytes; | ||
| 161 | u32 guest_gs_ar_bytes; | ||
| 162 | u32 guest_ldtr_ar_bytes; | ||
| 163 | u32 guest_tr_ar_bytes; | ||
| 164 | u32 guest_interruptibility_info; | ||
| 165 | u32 guest_activity_state; | ||
| 166 | u32 guest_sysenter_cs; | ||
| 167 | u32 host_ia32_sysenter_cs; | ||
| 168 | u32 vmx_preemption_timer_value; | ||
| 169 | u32 padding32[7]; /* room for future expansion */ | ||
| 170 | u16 virtual_processor_id; | ||
| 171 | u16 posted_intr_nv; | ||
| 172 | u16 guest_es_selector; | ||
| 173 | u16 guest_cs_selector; | ||
| 174 | u16 guest_ss_selector; | ||
| 175 | u16 guest_ds_selector; | ||
| 176 | u16 guest_fs_selector; | ||
| 177 | u16 guest_gs_selector; | ||
| 178 | u16 guest_ldtr_selector; | ||
| 179 | u16 guest_tr_selector; | ||
| 180 | u16 guest_intr_status; | ||
| 181 | u16 host_es_selector; | ||
| 182 | u16 host_cs_selector; | ||
| 183 | u16 host_ss_selector; | ||
| 184 | u16 host_ds_selector; | ||
| 185 | u16 host_fs_selector; | ||
| 186 | u16 host_gs_selector; | ||
| 187 | u16 host_tr_selector; | ||
| 188 | u16 guest_pml_index; | ||
| 189 | }; | ||
| 190 | |||
| 191 | /* | ||
| 192 | * VMCS12_REVISION is an arbitrary id that should be changed if the content or | ||
| 193 | * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and | ||
| 194 | * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. | ||
| 195 | * | ||
| 196 | * IMPORTANT: Changing this value will break save/restore compatibility with | ||
| 197 | * older kvm releases. | ||
| 198 | */ | ||
| 199 | #define VMCS12_REVISION 0x11e57ed0 | ||
| 200 | |||
| 201 | /* | ||
| 202 | * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region | ||
| 203 | * and any VMCS region. Although only sizeof(struct vmcs12) bytes are used by | ||
| 204 | * the current implementation, 4K is reserved to avoid future complications. | ||
| 205 | */ | ||
| 206 | #define VMCS12_SIZE 0x1000 | ||
| 207 | |||
| 208 | /* | ||
| 209 | * VMCS12_MAX_FIELD_INDEX is the highest index value used in any | ||
| 210 | * supported VMCS12 field encoding. | ||
| 211 | */ | ||
| 212 | #define VMCS12_MAX_FIELD_INDEX 0x17 | ||
| 213 | |||
| 214 | /* | ||
| 215 | * For save/restore compatibility, the vmcs12 field offsets must not change. | ||
| 216 | */ | ||
| 217 | #define CHECK_OFFSET(field, loc) \ | ||
| 218 | BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc), \ | ||
| 219 | "Offset of " #field " in struct vmcs12 has changed.") | ||
| 220 | |||
| 221 | static inline void vmx_check_vmcs12_offsets(void) | ||
| 222 | { | ||
| 223 | CHECK_OFFSET(hdr, 0); | ||
| 224 | CHECK_OFFSET(abort, 4); | ||
| 225 | CHECK_OFFSET(launch_state, 8); | ||
| 226 | CHECK_OFFSET(io_bitmap_a, 40); | ||
| 227 | CHECK_OFFSET(io_bitmap_b, 48); | ||
| 228 | CHECK_OFFSET(msr_bitmap, 56); | ||
| 229 | CHECK_OFFSET(vm_exit_msr_store_addr, 64); | ||
| 230 | CHECK_OFFSET(vm_exit_msr_load_addr, 72); | ||
| 231 | CHECK_OFFSET(vm_entry_msr_load_addr, 80); | ||
| 232 | CHECK_OFFSET(tsc_offset, 88); | ||
| 233 | CHECK_OFFSET(virtual_apic_page_addr, 96); | ||
| 234 | CHECK_OFFSET(apic_access_addr, 104); | ||
| 235 | CHECK_OFFSET(posted_intr_desc_addr, 112); | ||
| 236 | CHECK_OFFSET(ept_pointer, 120); | ||
| 237 | CHECK_OFFSET(eoi_exit_bitmap0, 128); | ||
| 238 | CHECK_OFFSET(eoi_exit_bitmap1, 136); | ||
| 239 | CHECK_OFFSET(eoi_exit_bitmap2, 144); | ||
| 240 | CHECK_OFFSET(eoi_exit_bitmap3, 152); | ||
| 241 | CHECK_OFFSET(xss_exit_bitmap, 160); | ||
| 242 | CHECK_OFFSET(guest_physical_address, 168); | ||
| 243 | CHECK_OFFSET(vmcs_link_pointer, 176); | ||
| 244 | CHECK_OFFSET(guest_ia32_debugctl, 184); | ||
| 245 | CHECK_OFFSET(guest_ia32_pat, 192); | ||
| 246 | CHECK_OFFSET(guest_ia32_efer, 200); | ||
| 247 | CHECK_OFFSET(guest_ia32_perf_global_ctrl, 208); | ||
| 248 | CHECK_OFFSET(guest_pdptr0, 216); | ||
| 249 | CHECK_OFFSET(guest_pdptr1, 224); | ||
| 250 | CHECK_OFFSET(guest_pdptr2, 232); | ||
| 251 | CHECK_OFFSET(guest_pdptr3, 240); | ||
| 252 | CHECK_OFFSET(guest_bndcfgs, 248); | ||
| 253 | CHECK_OFFSET(host_ia32_pat, 256); | ||
| 254 | CHECK_OFFSET(host_ia32_efer, 264); | ||
| 255 | CHECK_OFFSET(host_ia32_perf_global_ctrl, 272); | ||
| 256 | CHECK_OFFSET(vmread_bitmap, 280); | ||
| 257 | CHECK_OFFSET(vmwrite_bitmap, 288); | ||
| 258 | CHECK_OFFSET(vm_function_control, 296); | ||
| 259 | CHECK_OFFSET(eptp_list_address, 304); | ||
| 260 | CHECK_OFFSET(pml_address, 312); | ||
| 261 | CHECK_OFFSET(cr0_guest_host_mask, 344); | ||
| 262 | CHECK_OFFSET(cr4_guest_host_mask, 352); | ||
| 263 | CHECK_OFFSET(cr0_read_shadow, 360); | ||
| 264 | CHECK_OFFSET(cr4_read_shadow, 368); | ||
| 265 | CHECK_OFFSET(cr3_target_value0, 376); | ||
| 266 | CHECK_OFFSET(cr3_target_value1, 384); | ||
| 267 | CHECK_OFFSET(cr3_target_value2, 392); | ||
| 268 | CHECK_OFFSET(cr3_target_value3, 400); | ||
| 269 | CHECK_OFFSET(exit_qualification, 408); | ||
| 270 | CHECK_OFFSET(guest_linear_address, 416); | ||
| 271 | CHECK_OFFSET(guest_cr0, 424); | ||
| 272 | CHECK_OFFSET(guest_cr3, 432); | ||
| 273 | CHECK_OFFSET(guest_cr4, 440); | ||
| 274 | CHECK_OFFSET(guest_es_base, 448); | ||
| 275 | CHECK_OFFSET(guest_cs_base, 456); | ||
| 276 | CHECK_OFFSET(guest_ss_base, 464); | ||
| 277 | CHECK_OFFSET(guest_ds_base, 472); | ||
| 278 | CHECK_OFFSET(guest_fs_base, 480); | ||
| 279 | CHECK_OFFSET(guest_gs_base, 488); | ||
| 280 | CHECK_OFFSET(guest_ldtr_base, 496); | ||
| 281 | CHECK_OFFSET(guest_tr_base, 504); | ||
| 282 | CHECK_OFFSET(guest_gdtr_base, 512); | ||
| 283 | CHECK_OFFSET(guest_idtr_base, 520); | ||
| 284 | CHECK_OFFSET(guest_dr7, 528); | ||
| 285 | CHECK_OFFSET(guest_rsp, 536); | ||
| 286 | CHECK_OFFSET(guest_rip, 544); | ||
| 287 | CHECK_OFFSET(guest_rflags, 552); | ||
| 288 | CHECK_OFFSET(guest_pending_dbg_exceptions, 560); | ||
| 289 | CHECK_OFFSET(guest_sysenter_esp, 568); | ||
| 290 | CHECK_OFFSET(guest_sysenter_eip, 576); | ||
| 291 | CHECK_OFFSET(host_cr0, 584); | ||
| 292 | CHECK_OFFSET(host_cr3, 592); | ||
| 293 | CHECK_OFFSET(host_cr4, 600); | ||
| 294 | CHECK_OFFSET(host_fs_base, 608); | ||
| 295 | CHECK_OFFSET(host_gs_base, 616); | ||
| 296 | CHECK_OFFSET(host_tr_base, 624); | ||
| 297 | CHECK_OFFSET(host_gdtr_base, 632); | ||
| 298 | CHECK_OFFSET(host_idtr_base, 640); | ||
| 299 | CHECK_OFFSET(host_ia32_sysenter_esp, 648); | ||
| 300 | CHECK_OFFSET(host_ia32_sysenter_eip, 656); | ||
| 301 | CHECK_OFFSET(host_rsp, 664); | ||
| 302 | CHECK_OFFSET(host_rip, 672); | ||
| 303 | CHECK_OFFSET(pin_based_vm_exec_control, 744); | ||
| 304 | CHECK_OFFSET(cpu_based_vm_exec_control, 748); | ||
| 305 | CHECK_OFFSET(exception_bitmap, 752); | ||
| 306 | CHECK_OFFSET(page_fault_error_code_mask, 756); | ||
| 307 | CHECK_OFFSET(page_fault_error_code_match, 760); | ||
| 308 | CHECK_OFFSET(cr3_target_count, 764); | ||
| 309 | CHECK_OFFSET(vm_exit_controls, 768); | ||
| 310 | CHECK_OFFSET(vm_exit_msr_store_count, 772); | ||
| 311 | CHECK_OFFSET(vm_exit_msr_load_count, 776); | ||
| 312 | CHECK_OFFSET(vm_entry_controls, 780); | ||
| 313 | CHECK_OFFSET(vm_entry_msr_load_count, 784); | ||
| 314 | CHECK_OFFSET(vm_entry_intr_info_field, 788); | ||
| 315 | CHECK_OFFSET(vm_entry_exception_error_code, 792); | ||
| 316 | CHECK_OFFSET(vm_entry_instruction_len, 796); | ||
| 317 | CHECK_OFFSET(tpr_threshold, 800); | ||
| 318 | CHECK_OFFSET(secondary_vm_exec_control, 804); | ||
| 319 | CHECK_OFFSET(vm_instruction_error, 808); | ||
| 320 | CHECK_OFFSET(vm_exit_reason, 812); | ||
| 321 | CHECK_OFFSET(vm_exit_intr_info, 816); | ||
| 322 | CHECK_OFFSET(vm_exit_intr_error_code, 820); | ||
| 323 | CHECK_OFFSET(idt_vectoring_info_field, 824); | ||
| 324 | CHECK_OFFSET(idt_vectoring_error_code, 828); | ||
| 325 | CHECK_OFFSET(vm_exit_instruction_len, 832); | ||
| 326 | CHECK_OFFSET(vmx_instruction_info, 836); | ||
| 327 | CHECK_OFFSET(guest_es_limit, 840); | ||
| 328 | CHECK_OFFSET(guest_cs_limit, 844); | ||
| 329 | CHECK_OFFSET(guest_ss_limit, 848); | ||
| 330 | CHECK_OFFSET(guest_ds_limit, 852); | ||
| 331 | CHECK_OFFSET(guest_fs_limit, 856); | ||
| 332 | CHECK_OFFSET(guest_gs_limit, 860); | ||
| 333 | CHECK_OFFSET(guest_ldtr_limit, 864); | ||
| 334 | CHECK_OFFSET(guest_tr_limit, 868); | ||
| 335 | CHECK_OFFSET(guest_gdtr_limit, 872); | ||
| 336 | CHECK_OFFSET(guest_idtr_limit, 876); | ||
| 337 | CHECK_OFFSET(guest_es_ar_bytes, 880); | ||
| 338 | CHECK_OFFSET(guest_cs_ar_bytes, 884); | ||
| 339 | CHECK_OFFSET(guest_ss_ar_bytes, 888); | ||
| 340 | CHECK_OFFSET(guest_ds_ar_bytes, 892); | ||
| 341 | CHECK_OFFSET(guest_fs_ar_bytes, 896); | ||
| 342 | CHECK_OFFSET(guest_gs_ar_bytes, 900); | ||
| 343 | CHECK_OFFSET(guest_ldtr_ar_bytes, 904); | ||
| 344 | CHECK_OFFSET(guest_tr_ar_bytes, 908); | ||
| 345 | CHECK_OFFSET(guest_interruptibility_info, 912); | ||
| 346 | CHECK_OFFSET(guest_activity_state, 916); | ||
| 347 | CHECK_OFFSET(guest_sysenter_cs, 920); | ||
| 348 | CHECK_OFFSET(host_ia32_sysenter_cs, 924); | ||
| 349 | CHECK_OFFSET(vmx_preemption_timer_value, 928); | ||
| 350 | CHECK_OFFSET(virtual_processor_id, 960); | ||
| 351 | CHECK_OFFSET(posted_intr_nv, 962); | ||
| 352 | CHECK_OFFSET(guest_es_selector, 964); | ||
| 353 | CHECK_OFFSET(guest_cs_selector, 966); | ||
| 354 | CHECK_OFFSET(guest_ss_selector, 968); | ||
| 355 | CHECK_OFFSET(guest_ds_selector, 970); | ||
| 356 | CHECK_OFFSET(guest_fs_selector, 972); | ||
| 357 | CHECK_OFFSET(guest_gs_selector, 974); | ||
| 358 | CHECK_OFFSET(guest_ldtr_selector, 976); | ||
| 359 | CHECK_OFFSET(guest_tr_selector, 978); | ||
| 360 | CHECK_OFFSET(guest_intr_status, 980); | ||
| 361 | CHECK_OFFSET(host_es_selector, 982); | ||
| 362 | CHECK_OFFSET(host_cs_selector, 984); | ||
| 363 | CHECK_OFFSET(host_ss_selector, 986); | ||
| 364 | CHECK_OFFSET(host_ds_selector, 988); | ||
| 365 | CHECK_OFFSET(host_fs_selector, 990); | ||
| 366 | CHECK_OFFSET(host_gs_selector, 992); | ||
| 367 | CHECK_OFFSET(host_tr_selector, 994); | ||
| 368 | CHECK_OFFSET(guest_pml_index, 996); | ||
| 369 | } | ||
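The CHECK_OFFSET() calls above turn any accidental reordering of struct vmcs12 fields into a build failure instead of a silent save/restore format break. As a rough, self-contained illustration of the same idea outside the kernel (the struct below is purely hypothetical and not a kernel structure), C11's _Static_assert can pin a serialized layout in the same way:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical on-the-wire header, used only for this sketch. */
struct wire_hdr {
	uint32_t magic;
	uint32_t version;
	uint64_t payload_len;
};

/* Reordering or resizing a field now breaks the build, not the format. */
_Static_assert(offsetof(struct wire_hdr, magic) == 0, "magic moved");
_Static_assert(offsetof(struct wire_hdr, version) == 4, "version moved");
_Static_assert(offsetof(struct wire_hdr, payload_len) == 8, "payload_len moved");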
| 370 | |||
| 371 | extern const unsigned short vmcs_field_to_offset_table[]; | ||
| 372 | extern const unsigned int nr_vmcs12_fields; | ||
| 373 | |||
| 374 | #define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) | ||
| 375 | |||
| 376 | static inline short vmcs_field_to_offset(unsigned long field) | ||
| 377 | { | ||
| 378 | unsigned short offset; | ||
| 379 | unsigned int index; | ||
| 380 | |||
| 381 | if (field >> 15) | ||
| 382 | return -ENOENT; | ||
| 383 | |||
| 384 | index = ROL16(field, 6); | ||
| 385 | if (index >= nr_vmcs12_fields) | ||
| 386 | return -ENOENT; | ||
| 387 | |||
| 388 | index = array_index_nospec(index, nr_vmcs12_fields); | ||
| 389 | offset = vmcs_field_to_offset_table[index]; | ||
| 390 | if (offset == 0) | ||
| 391 | return -ENOENT; | ||
| 392 | return offset; | ||
| 393 | } | ||
| 394 | |||
| 395 | #undef ROL16 | ||
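vmcs_field_to_offset() derives the table index by rotating the 16-bit VMCS field encoding left by six bits, which folds the encoding's width and type bits into the low bits of the index while the index bits move to the upper part. A minimal userspace sketch of that decoding step follows; the 0x4800 constant is the architectural GUEST_ES_LIMIT encoding and is used here only as an example:

#include <stdint.h>
#include <stdio.h>

static uint16_t rol16(uint16_t val, int n)
{
	return (uint16_t)((val << n) | (val >> (16 - n)));
}

int main(void)
{
	uint16_t field = 0x4800;		/* GUEST_ES_LIMIT encoding (example) */
	uint16_t index = rol16(field, 6);

	/* Prints "field 0x4800 -> table index 18". */
	printf("field 0x%04x -> table index %u\n", field, index);
	return 0;
}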
| 396 | |||
| 397 | /* | ||
| 398 | * Read a vmcs12 field. Since these can have varying lengths and we return | ||
| 399 | * one type, we chose the biggest type (u64) and zero-extend the return value | ||
| 400 | * to that size. Note that the caller, handle_vmread, might need to use only | ||
| 401 | * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of | ||
| 402 | * 64-bit fields are to be returned). | ||
| 403 | */ | ||
| 404 | static inline int vmcs12_read_any(struct vmcs12 *vmcs12, | ||
| 405 | unsigned long field, u64 *ret) | ||
| 406 | { | ||
| 407 | short offset = vmcs_field_to_offset(field); | ||
| 408 | char *p; | ||
| 409 | |||
| 410 | if (offset < 0) | ||
| 411 | return offset; | ||
| 412 | |||
| 413 | p = (char *)vmcs12 + offset; | ||
| 414 | |||
| 415 | switch (vmcs_field_width(field)) { | ||
| 416 | case VMCS_FIELD_WIDTH_NATURAL_WIDTH: | ||
| 417 | *ret = *((natural_width *)p); | ||
| 418 | return 0; | ||
| 419 | case VMCS_FIELD_WIDTH_U16: | ||
| 420 | *ret = *((u16 *)p); | ||
| 421 | return 0; | ||
| 422 | case VMCS_FIELD_WIDTH_U32: | ||
| 423 | *ret = *((u32 *)p); | ||
| 424 | return 0; | ||
| 425 | case VMCS_FIELD_WIDTH_U64: | ||
| 426 | *ret = *((u64 *)p); | ||
| 427 | return 0; | ||
| 428 | default: | ||
| 429 | WARN_ON(1); | ||
| 430 | return -ENOENT; | ||
| 431 | } | ||
| 432 | } | ||
| 433 | |||
| 434 | static inline int vmcs12_write_any(struct vmcs12 *vmcs12, | ||
| 435 | unsigned long field, u64 field_value) | ||
| 436 | { | ||
| 437 | short offset = vmcs_field_to_offset(field); | ||
| 438 | char *p = (char *)vmcs12 + offset; | ||
| 439 | ||| 
| 440 | if (offset < 0) | ||
| 441 | return offset; | ||
| 442 | switch (vmcs_field_width(field)) { | ||
| 443 | case VMCS_FIELD_WIDTH_U16: | ||
| 444 | *(u16 *)p = field_value; | ||
| 445 | return 0; | ||
| 446 | case VMCS_FIELD_WIDTH_U32: | ||
| 447 | *(u32 *)p = field_value; | ||
| 448 | return 0; | ||
| 449 | case VMCS_FIELD_WIDTH_U64: | ||
| 450 | *(u64 *)p = field_value; | ||
| 451 | return 0; | ||
| 452 | case VMCS_FIELD_WIDTH_NATURAL_WIDTH: | ||
| 453 | *(natural_width *)p = field_value; | ||
| 454 | return 0; | ||
| 455 | default: | ||
| 456 | WARN_ON(1); | ||
| 457 | return -ENOENT; | ||
| 458 | } | ||
| 459 | |||
| 460 | } | ||
| 461 | |||
| 462 | #endif /* __KVM_X86_VMX_VMCS12_H */ | ||
diff --git a/arch/x86/kvm/vmx_shadow_fields.h b/arch/x86/kvm/vmx/vmcs_shadow_fields.h index 132432f375c2..132432f375c2 100644 --- a/arch/x86/kvm/vmx_shadow_fields.h +++ b/arch/x86/kvm/vmx/vmcs_shadow_fields.h | |||
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S new file mode 100644 index 000000000000..bcef2c7e9bc4 --- /dev/null +++ b/arch/x86/kvm/vmx/vmenter.S | |||
| @@ -0,0 +1,57 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
| 2 | #include <linux/linkage.h> | ||
| 3 | #include <asm/asm.h> | ||
| 4 | |||
| 5 | .text | ||
| 6 | |||
| 7 | /** | ||
| 8 | * vmx_vmenter - VM-Enter the current loaded VMCS | ||
| 9 | * | ||
| 10 | * %RFLAGS.ZF: !VMCS.LAUNCHED, i.e. controls VMLAUNCH vs. VMRESUME | ||
| 11 | * | ||
| 12 | * Returns: | ||
| 13 | * %RFLAGS.CF is set on VM-Fail Invalid | ||
| 14 | * %RFLAGS.ZF is set on VM-Fail Valid | ||
| 15 | * %RFLAGS.{CF,ZF} are cleared on VM-Success, i.e. VM-Exit | ||
| 16 | * | ||
| 17 | * Note that VMRESUME/VMLAUNCH fall-through and return directly if | ||
| 18 | * they VM-Fail, whereas a successful VM-Enter + VM-Exit will jump | ||
| 19 | * to vmx_vmexit. | ||
| 20 | */ | ||
| 21 | ENTRY(vmx_vmenter) | ||
| 22 | /* EFLAGS.ZF is set if VMCS.LAUNCHED == 0 */ | ||
| 23 | je 2f | ||
| 24 | |||
| 25 | 1: vmresume | ||
| 26 | ret | ||
| 27 | |||
| 28 | 2: vmlaunch | ||
| 29 | ret | ||
| 30 | |||
| 31 | 3: cmpb $0, kvm_rebooting | ||
| 32 | jne 4f | ||
| 33 | call kvm_spurious_fault | ||
| 34 | 4: ret | ||
| 35 | |||
| 36 | .pushsection .fixup, "ax" | ||
| 37 | 5: jmp 3b | ||
| 38 | .popsection | ||
| 39 | |||
| 40 | _ASM_EXTABLE(1b, 5b) | ||
| 41 | _ASM_EXTABLE(2b, 5b) | ||
| 42 | |||
| 43 | ENDPROC(vmx_vmenter) | ||
| 44 | |||
| 45 | /** | ||
| 46 | * vmx_vmexit - Handle a VMX VM-Exit | ||
| 47 | * | ||
| 48 | * Returns: | ||
| 49 | * %RFLAGS.{CF,ZF} are cleared on VM-Success, i.e. VM-Exit | ||
| 50 | * | ||
| 51 | * This is vmx_vmenter's partner in crime. On a VM-Exit, control will jump | ||
| 52 | * here after hardware loads the host's state, i.e. this is the destination | ||
| 53 | * referred to by VMCS.HOST_RIP. | ||
| 54 | */ | ||
| 55 | ENTRY(vmx_vmexit) | ||
| 56 | ret | ||
| 57 | ENDPROC(vmx_vmexit) | ||
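For readers following the RFLAGS contract documented in vmx_vmenter above, here is a hedged C sketch (not the kernel's actual caller) of how the three possible outcomes would be distinguished from the flags captured right after the call:

#include <stdio.h>

#define X86_EFLAGS_CF (1UL << 0)
#define X86_EFLAGS_ZF (1UL << 6)

enum vmenter_result {
	VMENTER_VMEXIT,		/* VM-Enter succeeded; we are back after a VM-Exit */
	VMENTER_FAIL_INVALID,	/* VM-Fail with no valid current VMCS (CF set) */
	VMENTER_FAIL_VALID,	/* VM-Fail; error code in VM_INSTRUCTION_ERROR (ZF set) */
};

static enum vmenter_result classify_vmenter(unsigned long flags)
{
	if (flags & X86_EFLAGS_CF)
		return VMENTER_FAIL_INVALID;
	if (flags & X86_EFLAGS_ZF)
		return VMENTER_FAIL_VALID;
	return VMENTER_VMEXIT;
}

int main(void)
{
	/* Example: ZF set maps to VM-Fail Valid. */
	printf("%d\n", classify_vmenter(X86_EFLAGS_ZF));
	return 0;
}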
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c new file mode 100644 index 000000000000..41d6f7081ff7 --- /dev/null +++ b/arch/x86/kvm/vmx/vmx.c | |||
| @@ -0,0 +1,7935 @@ | |||
| 1 | /* | ||
| 2 | * Kernel-based Virtual Machine driver for Linux | ||
| 3 | * | ||
| 4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
| 5 | * machines without emulation or binary translation. | ||
| 6 | * | ||
| 7 | * Copyright (C) 2006 Qumranet, Inc. | ||
| 8 | * Copyright 2010 Red Hat, Inc. and/or its affiliates. | ||
| 9 | * | ||
| 10 | * Authors: | ||
| 11 | * Avi Kivity <avi@qumranet.com> | ||
| 12 | * Yaniv Kamay <yaniv@qumranet.com> | ||
| 13 | * | ||
| 14 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
| 15 | * the COPYING file in the top-level directory. | ||
| 16 | * | ||
| 17 | */ | ||
| 18 | |||
| 19 | #include <linux/frame.h> | ||
| 20 | #include <linux/highmem.h> | ||
| 21 | #include <linux/hrtimer.h> | ||
| 22 | #include <linux/kernel.h> | ||
| 23 | #include <linux/kvm_host.h> | ||
| 24 | #include <linux/module.h> | ||
| 25 | #include <linux/moduleparam.h> | ||
| 26 | #include <linux/mod_devicetable.h> | ||
| 27 | #include <linux/mm.h> | ||
| 28 | #include <linux/sched.h> | ||
| 29 | #include <linux/slab.h> | ||
| 30 | #include <linux/tboot.h> | ||
| 31 | #include <linux/trace_events.h> | ||
| 32 | |||
| 33 | #include <asm/apic.h> | ||
| 34 | #include <asm/asm.h> | ||
| 35 | #include <asm/cpu.h> | ||
| 36 | #include <asm/debugreg.h> | ||
| 37 | #include <asm/desc.h> | ||
| 38 | #include <asm/fpu/internal.h> | ||
| 39 | #include <asm/io.h> | ||
| 40 | #include <asm/irq_remapping.h> | ||
| 41 | #include <asm/kexec.h> | ||
| 42 | #include <asm/perf_event.h> | ||
| 43 | #include <asm/mce.h> | ||
| 44 | #include <asm/mmu_context.h> | ||
| 45 | #include <asm/mshyperv.h> | ||
| 46 | #include <asm/spec-ctrl.h> | ||
| 47 | #include <asm/virtext.h> | ||
| 48 | #include <asm/vmx.h> | ||
| 49 | |||
| 50 | #include "capabilities.h" | ||
| 51 | #include "cpuid.h" | ||
| 52 | #include "evmcs.h" | ||
| 53 | #include "irq.h" | ||
| 54 | #include "kvm_cache_regs.h" | ||
| 55 | #include "lapic.h" | ||
| 56 | #include "mmu.h" | ||
| 57 | #include "nested.h" | ||
| 58 | #include "ops.h" | ||
| 59 | #include "pmu.h" | ||
| 60 | #include "trace.h" | ||
| 61 | #include "vmcs.h" | ||
| 62 | #include "vmcs12.h" | ||
| 63 | #include "vmx.h" | ||
| 64 | #include "x86.h" | ||
| 65 | |||
| 66 | MODULE_AUTHOR("Qumranet"); | ||
| 67 | MODULE_LICENSE("GPL"); | ||
| 68 | |||
| 69 | static const struct x86_cpu_id vmx_cpu_id[] = { | ||
| 70 | X86_FEATURE_MATCH(X86_FEATURE_VMX), | ||
| 71 | {} | ||
| 72 | }; | ||
| 73 | MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); | ||
| 74 | |||
| 75 | bool __read_mostly enable_vpid = 1; | ||
| 76 | module_param_named(vpid, enable_vpid, bool, 0444); | ||
| 77 | |||
| 78 | static bool __read_mostly enable_vnmi = 1; | ||
| 79 | module_param_named(vnmi, enable_vnmi, bool, S_IRUGO); | ||
| 80 | |||
| 81 | bool __read_mostly flexpriority_enabled = 1; | ||
| 82 | module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); | ||
| 83 | |||
| 84 | bool __read_mostly enable_ept = 1; | ||
| 85 | module_param_named(ept, enable_ept, bool, S_IRUGO); | ||
| 86 | |||
| 87 | bool __read_mostly enable_unrestricted_guest = 1; | ||
| 88 | module_param_named(unrestricted_guest, | ||
| 89 | enable_unrestricted_guest, bool, S_IRUGO); | ||
| 90 | |||
| 91 | bool __read_mostly enable_ept_ad_bits = 1; | ||
| 92 | module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); | ||
| 93 | |||
| 94 | static bool __read_mostly emulate_invalid_guest_state = true; | ||
| 95 | module_param(emulate_invalid_guest_state, bool, S_IRUGO); | ||
| 96 | |||
| 97 | static bool __read_mostly fasteoi = 1; | ||
| 98 | module_param(fasteoi, bool, S_IRUGO); | ||
| 99 | |||
| 100 | static bool __read_mostly enable_apicv = 1; | ||
| 101 | module_param(enable_apicv, bool, S_IRUGO); | ||
| 102 | |||
| 103 | /* | ||
| 104 | * If nested=1, nested virtualization is supported, i.e., guests may use | ||
| 105 | * VMX and act as hypervisors for their own guests. If nested=0, guests may not | ||
| 106 | * use VMX instructions. | ||
| 107 | */ | ||
| 108 | static bool __read_mostly nested = 1; | ||
| 109 | module_param(nested, bool, S_IRUGO); | ||
| 110 | |||
| 111 | static u64 __read_mostly host_xss; | ||
| 112 | |||
| 113 | bool __read_mostly enable_pml = 1; | ||
| 114 | module_param_named(pml, enable_pml, bool, S_IRUGO); | ||
| 115 | |||
| 116 | #define MSR_BITMAP_MODE_X2APIC 1 | ||
| 117 | #define MSR_BITMAP_MODE_X2APIC_APICV 2 | ||
| 118 | |||
| 119 | #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL | ||
| 120 | |||
| 121 | /* Guest_tsc -> host_tsc conversion requires 64-bit division. */ | ||
| 122 | static int __read_mostly cpu_preemption_timer_multi; | ||
| 123 | static bool __read_mostly enable_preemption_timer = 1; | ||
| 124 | #ifdef CONFIG_X86_64 | ||
| 125 | module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); | ||
| 126 | #endif | ||
| 127 | |||
| 128 | #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) | ||
| 129 | #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE | ||
| 130 | #define KVM_VM_CR0_ALWAYS_ON \ | ||
| 131 | (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \ | ||
| 132 | X86_CR0_WP | X86_CR0_PG | X86_CR0_PE) | ||
| 133 | #define KVM_CR4_GUEST_OWNED_BITS \ | ||
| 134 | (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ | ||
| 135 | | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD) | ||
| 136 | |||
| 137 | #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE | ||
| 138 | #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) | ||
| 139 | #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) | ||
| 140 | |||
| 141 | #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) | ||
| 142 | |||
| 143 | #define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \ | ||
| 144 | RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \ | ||
| 145 | RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \ | ||
| 146 | RTIT_STATUS_BYTECNT)) | ||
| 147 | |||
| 148 | #define MSR_IA32_RTIT_OUTPUT_BASE_MASK \ | ||
| 149 | (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f) | ||
| 150 | |||
| 151 | /* | ||
| 152 | * These 2 parameters are used to config the controls for Pause-Loop Exiting: | ||
| 153 | * ple_gap: upper bound on the amount of time between two successive | ||
| 154 | * executions of PAUSE in a loop. It also indicates whether PLE is enabled. | ||
| 155 | * According to testing, this time is usually smaller than 128 cycles. | ||
| 156 | * ple_window: upper bound on the amount of time a guest is allowed to execute | ||
| 157 | * in a PAUSE loop. Tests indicate that most spinlocks are held for | ||
| 158 | * less than 2^12 cycles. | ||
| 159 | * Time is measured using a counter that runs at the same rate as the TSC; | ||
| 160 | * refer to SDM volume 3b, sections 21.6.13 and 22.1.3. | ||
| 161 | */ | ||
| 162 | static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP; | ||
| 163 | module_param(ple_gap, uint, 0444); | ||
| 164 | |||
| 165 | static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; | ||
| 166 | module_param(ple_window, uint, 0444); | ||
| 167 | |||
| 168 | /* Default doubles per-vcpu window every exit. */ | ||
| 169 | static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW; | ||
| 170 | module_param(ple_window_grow, uint, 0444); | ||
| 171 | |||
| 172 | /* Default resets per-vcpu window every exit to ple_window. */ | ||
| 173 | static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK; | ||
| 174 | module_param(ple_window_shrink, uint, 0444); | ||
| 175 | |||
| 176 | /* Default is to compute the maximum so we can never overflow. */ | ||
| 177 | static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; | ||
| 178 | module_param(ple_window_max, uint, 0444); | ||
| 179 | |||
| 180 | /* Default is SYSTEM mode, 1 for host-guest mode */ | ||
| 181 | int __read_mostly pt_mode = PT_MODE_SYSTEM; | ||
| 182 | module_param(pt_mode, int, S_IRUGO); | ||
| 183 | |||
| 184 | static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); | ||
| 185 | static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); | ||
| 186 | static DEFINE_MUTEX(vmx_l1d_flush_mutex); | ||
| 187 | |||
| 188 | /* Storage for pre module init parameter parsing */ | ||
| 189 | static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO; | ||
| 190 | |||
| 191 | static const struct { | ||
| 192 | const char *option; | ||
| 193 | bool for_parse; | ||
| 194 | } vmentry_l1d_param[] = { | ||
| 195 | [VMENTER_L1D_FLUSH_AUTO] = {"auto", true}, | ||
| 196 | [VMENTER_L1D_FLUSH_NEVER] = {"never", true}, | ||
| 197 | [VMENTER_L1D_FLUSH_COND] = {"cond", true}, | ||
| 198 | [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true}, | ||
| 199 | [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false}, | ||
| 200 | [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false}, | ||
| 201 | }; | ||
| 202 | |||
| 203 | #define L1D_CACHE_ORDER 4 | ||
| 204 | static void *vmx_l1d_flush_pages; | ||
| 205 | |||
| 206 | static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) | ||
| 207 | { | ||
| 208 | struct page *page; | ||
| 209 | unsigned int i; | ||
| 210 | |||
| 211 | if (!enable_ept) { | ||
| 212 | l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; | ||
| 213 | return 0; | ||
| 214 | } | ||
| 215 | |||
| 216 | if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { | ||
| 217 | u64 msr; | ||
| 218 | |||
| 219 | rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); | ||
| 220 | if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { | ||
| 221 | l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; | ||
| 222 | return 0; | ||
| 223 | } | ||
| 224 | } | ||
| 225 | |||
| 226 | /* If set to auto use the default l1tf mitigation method */ | ||
| 227 | if (l1tf == VMENTER_L1D_FLUSH_AUTO) { | ||
| 228 | switch (l1tf_mitigation) { | ||
| 229 | case L1TF_MITIGATION_OFF: | ||
| 230 | l1tf = VMENTER_L1D_FLUSH_NEVER; | ||
| 231 | break; | ||
| 232 | case L1TF_MITIGATION_FLUSH_NOWARN: | ||
| 233 | case L1TF_MITIGATION_FLUSH: | ||
| 234 | case L1TF_MITIGATION_FLUSH_NOSMT: | ||
| 235 | l1tf = VMENTER_L1D_FLUSH_COND; | ||
| 236 | break; | ||
| 237 | case L1TF_MITIGATION_FULL: | ||
| 238 | case L1TF_MITIGATION_FULL_FORCE: | ||
| 239 | l1tf = VMENTER_L1D_FLUSH_ALWAYS; | ||
| 240 | break; | ||
| 241 | } | ||
| 242 | } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) { | ||
| 243 | l1tf = VMENTER_L1D_FLUSH_ALWAYS; | ||
| 244 | } | ||
| 245 | |||
| 246 | if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && | ||
| 247 | !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { | ||
| 248 | page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); | ||
| 249 | if (!page) | ||
| 250 | return -ENOMEM; | ||
| 251 | vmx_l1d_flush_pages = page_address(page); | ||
| 252 | |||
| 253 | /* | ||
| 254 | * Initialize each page with a different pattern in | ||
| 255 | * order to protect against KSM in the nested | ||
| 256 | * virtualization case. | ||
| 257 | */ | ||
| 258 | for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) { | ||
| 259 | memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1, | ||
| 260 | PAGE_SIZE); | ||
| 261 | } | ||
| 262 | } | ||
| 263 | |||
| 264 | l1tf_vmx_mitigation = l1tf; | ||
| 265 | |||
| 266 | if (l1tf != VMENTER_L1D_FLUSH_NEVER) | ||
| 267 | static_branch_enable(&vmx_l1d_should_flush); | ||
| 268 | else | ||
| 269 | static_branch_disable(&vmx_l1d_should_flush); | ||
| 270 | |||
| 271 | if (l1tf == VMENTER_L1D_FLUSH_COND) | ||
| 272 | static_branch_enable(&vmx_l1d_flush_cond); | ||
| 273 | else | ||
| 274 | static_branch_disable(&vmx_l1d_flush_cond); | ||
| 275 | return 0; | ||
| 276 | } | ||
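When the CPU lacks the hardware L1D flush facility, the code above falls back to a software flush buffer of 1 << L1D_CACHE_ORDER pages, each filled with a distinct byte so KSM cannot merge them in a nested setup. A standalone sketch of that sizing and fill pattern (a PAGE_SIZE of 4096 is assumed here):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE	4096	/* assumed for this sketch */
#define L1D_CACHE_ORDER	4

int main(void)
{
	size_t nr_pages = 1u << L1D_CACHE_ORDER;
	char *buf = malloc(nr_pages * PAGE_SIZE);
	size_t i;

	if (!buf)
		return 1;

	/* Give every page a unique pattern so identical-page merging cannot apply. */
	for (i = 0; i < nr_pages; i++)
		memset(buf + i * PAGE_SIZE, (int)(i + 1), PAGE_SIZE);

	printf("flush buffer: %zu pages, %zu KiB total\n",
	       nr_pages, nr_pages * PAGE_SIZE / 1024);
	free(buf);
	return 0;
}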
| 277 | |||
| 278 | static int vmentry_l1d_flush_parse(const char *s) | ||
| 279 | { | ||
| 280 | unsigned int i; | ||
| 281 | |||
| 282 | if (s) { | ||
| 283 | for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { | ||
| 284 | if (vmentry_l1d_param[i].for_parse && | ||
| 285 | sysfs_streq(s, vmentry_l1d_param[i].option)) | ||
| 286 | return i; | ||
| 287 | } | ||
| 288 | } | ||
| 289 | return -EINVAL; | ||
| 290 | } | ||
| 291 | |||
| 292 | static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) | ||
| 293 | { | ||
| 294 | int l1tf, ret; | ||
| 295 | |||
| 296 | l1tf = vmentry_l1d_flush_parse(s); | ||
| 297 | if (l1tf < 0) | ||
| 298 | return l1tf; | ||
| 299 | |||
| 300 | if (!boot_cpu_has(X86_BUG_L1TF)) | ||
| 301 | return 0; | ||
| 302 | |||
| 303 | /* | ||
| 304 | * Has vmx_init() run already? If not then this is the pre init | ||
| 305 | * parameter parsing. In that case just store the value and let | ||
| 306 | * vmx_init() do the proper setup after enable_ept has been | ||
| 307 | * established. | ||
| 308 | */ | ||
| 309 | if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) { | ||
| 310 | vmentry_l1d_flush_param = l1tf; | ||
| 311 | return 0; | ||
| 312 | } | ||
| 313 | |||
| 314 | mutex_lock(&vmx_l1d_flush_mutex); | ||
| 315 | ret = vmx_setup_l1d_flush(l1tf); | ||
| 316 | mutex_unlock(&vmx_l1d_flush_mutex); | ||
| 317 | return ret; | ||
| 318 | } | ||
| 319 | |||
| 320 | static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) | ||
| 321 | { | ||
| 322 | if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param))) | ||
| 323 | return sprintf(s, "???\n"); | ||
| 324 | |||
| 325 | return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); | ||
| 326 | } | ||
| 327 | |||
| 328 | static const struct kernel_param_ops vmentry_l1d_flush_ops = { | ||
| 329 | .set = vmentry_l1d_flush_set, | ||
| 330 | .get = vmentry_l1d_flush_get, | ||
| 331 | }; | ||
| 332 | module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); | ||
| 333 | |||
| 334 | static bool guest_state_valid(struct kvm_vcpu *vcpu); | ||
| 335 | static u32 vmx_segment_access_rights(struct kvm_segment *var); | ||
| 336 | static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, | ||
| 337 | u32 msr, int type); | ||
| 338 | |||
| 339 | void vmx_vmexit(void); | ||
| 340 | |||
| 341 | static DEFINE_PER_CPU(struct vmcs *, vmxarea); | ||
| 342 | DEFINE_PER_CPU(struct vmcs *, current_vmcs); | ||
| 343 | /* | ||
| 344 | * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed | ||
| 345 | * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. | ||
| 346 | */ | ||
| 347 | static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); | ||
| 348 | |||
| 349 | /* | ||
| 350 | * We maintain a per-CPU linked list of vCPUs, so in wakeup_handler() we | ||
| 351 | * can find which vCPU should be woken up. | ||
| 352 | */ | ||
| 353 | static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); | ||
| 354 | static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); | ||
| 355 | |||
| 356 | static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); | ||
| 357 | static DEFINE_SPINLOCK(vmx_vpid_lock); | ||
| 358 | |||
| 359 | struct vmcs_config vmcs_config; | ||
| 360 | struct vmx_capability vmx_capability; | ||
| 361 | |||
| 362 | #define VMX_SEGMENT_FIELD(seg) \ | ||
| 363 | [VCPU_SREG_##seg] = { \ | ||
| 364 | .selector = GUEST_##seg##_SELECTOR, \ | ||
| 365 | .base = GUEST_##seg##_BASE, \ | ||
| 366 | .limit = GUEST_##seg##_LIMIT, \ | ||
| 367 | .ar_bytes = GUEST_##seg##_AR_BYTES, \ | ||
| 368 | } | ||
| 369 | |||
| 370 | static const struct kvm_vmx_segment_field { | ||
| 371 | unsigned selector; | ||
| 372 | unsigned base; | ||
| 373 | unsigned limit; | ||
| 374 | unsigned ar_bytes; | ||
| 375 | } kvm_vmx_segment_fields[] = { | ||
| 376 | VMX_SEGMENT_FIELD(CS), | ||
| 377 | VMX_SEGMENT_FIELD(DS), | ||
| 378 | VMX_SEGMENT_FIELD(ES), | ||
| 379 | VMX_SEGMENT_FIELD(FS), | ||
| 380 | VMX_SEGMENT_FIELD(GS), | ||
| 381 | VMX_SEGMENT_FIELD(SS), | ||
| 382 | VMX_SEGMENT_FIELD(TR), | ||
| 383 | VMX_SEGMENT_FIELD(LDTR), | ||
| 384 | }; | ||
| 385 | |||
| 386 | u64 host_efer; | ||
| 387 | |||
| 388 | /* | ||
| 389 | * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm | ||
| 390 | * will emulate SYSCALL in legacy mode if the vendor string in guest | ||
| 391 | * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To | ||
| 392 | * support this emulation, IA32_STAR must always be included in | ||
| 393 | * vmx_msr_index[], even in i386 builds. | ||
| 394 | */ | ||
| 395 | const u32 vmx_msr_index[] = { | ||
| 396 | #ifdef CONFIG_X86_64 | ||
| 397 | MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, | ||
| 398 | #endif | ||
| 399 | MSR_EFER, MSR_TSC_AUX, MSR_STAR, | ||
| 400 | }; | ||
| 401 | |||
| 402 | #if IS_ENABLED(CONFIG_HYPERV) | ||
| 403 | static bool __read_mostly enlightened_vmcs = true; | ||
| 404 | module_param(enlightened_vmcs, bool, 0444); | ||
| 405 | |||
| 406 | /* check_ept_pointer() should be under protection of ept_pointer_lock. */ | ||
| 407 | static void check_ept_pointer_match(struct kvm *kvm) | ||
| 408 | { | ||
| 409 | struct kvm_vcpu *vcpu; | ||
| 410 | u64 tmp_eptp = INVALID_PAGE; | ||
| 411 | int i; | ||
| 412 | |||
| 413 | kvm_for_each_vcpu(i, vcpu, kvm) { | ||
| 414 | if (!VALID_PAGE(tmp_eptp)) { | ||
| 415 | tmp_eptp = to_vmx(vcpu)->ept_pointer; | ||
| 416 | } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) { | ||
| 417 | to_kvm_vmx(kvm)->ept_pointers_match | ||
| 418 | = EPT_POINTERS_MISMATCH; | ||
| 419 | return; | ||
| 420 | } | ||
| 421 | } | ||
| 422 | |||
| 423 | to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH; | ||
| 424 | } | ||
| 425 | |||
| 426 | int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush, | ||
| 427 | void *data) | ||
| 428 | { | ||
| 429 | struct kvm_tlb_range *range = data; | ||
| 430 | |||
| 431 | return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn, | ||
| 432 | range->pages); | ||
| 433 | } | ||
| 434 | |||
| 435 | static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm, | ||
| 436 | struct kvm_vcpu *vcpu, struct kvm_tlb_range *range) | ||
| 437 | { | ||
| 438 | u64 ept_pointer = to_vmx(vcpu)->ept_pointer; | ||
| 439 | |||
| 440 | /* | ||
| 441 | * The FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the address | ||
| 442 | * of the base of the EPT PML4 table, so strip off the EPT | ||
| 443 | * configuration information. | ||
| 444 | */ | ||
| 445 | if (range) | ||
| 446 | return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK, | ||
| 447 | kvm_fill_hv_flush_list_func, (void *)range); | ||
| 448 | else | ||
| 449 | return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK); | ||
| 450 | } | ||
| 451 | |||
| 452 | static int hv_remote_flush_tlb_with_range(struct kvm *kvm, | ||
| 453 | struct kvm_tlb_range *range) | ||
| 454 | { | ||
| 455 | struct kvm_vcpu *vcpu; | ||
| 456 | int ret = -ENOTSUPP, i; | ||
| 457 | |||
| 458 | spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); | ||
| 459 | |||
| 460 | if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK) | ||
| 461 | check_ept_pointer_match(kvm); | ||
| 462 | |||
| 463 | if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) { | ||
| 464 | kvm_for_each_vcpu(i, vcpu, kvm) { | ||
| 465 | /* If ept_pointer is an invalid pointer, bypass the flush request. */ | ||
| 466 | if (VALID_PAGE(to_vmx(vcpu)->ept_pointer)) | ||
| 467 | ret |= __hv_remote_flush_tlb_with_range( | ||
| 468 | kvm, vcpu, range); | ||
| 469 | } | ||
| 470 | } else { | ||
| 471 | ret = __hv_remote_flush_tlb_with_range(kvm, | ||
| 472 | kvm_get_vcpu(kvm, 0), range); | ||
| 473 | } | ||
| 474 | |||
| 475 | spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); | ||
| 476 | return ret; | ||
| 477 | } | ||
| 478 | static int hv_remote_flush_tlb(struct kvm *kvm) | ||
| 479 | { | ||
| 480 | return hv_remote_flush_tlb_with_range(kvm, NULL); | ||
| 481 | } | ||
| 482 | |||
| 483 | #endif /* IS_ENABLED(CONFIG_HYPERV) */ | ||
| 484 | |||
| 485 | /* | ||
| 486 | * Comment's format: document - errata name - stepping - processor name. | ||
| 487 | * Taken from | ||
| 488 | * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp | ||
| 489 | */ | ||
| 490 | static u32 vmx_preemption_cpu_tfms[] = { | ||
| 491 | /* 323344.pdf - BA86 - D0 - Xeon 7500 Series */ | ||
| 492 | 0x000206E6, | ||
| 493 | /* 323056.pdf - AAX65 - C2 - Xeon L3406 */ | ||
| 494 | /* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */ | ||
| 495 | /* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */ | ||
| 496 | 0x00020652, | ||
| 497 | /* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */ | ||
| 498 | 0x00020655, | ||
| 499 | /* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */ | ||
| 500 | /* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */ | ||
| 501 | /* | ||
| 502 | * 320767.pdf - AAP86 - B1 - | ||
| 503 | * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile | ||
| 504 | */ | ||
| 505 | 0x000106E5, | ||
| 506 | /* 321333.pdf - AAM126 - C0 - Xeon 3500 */ | ||
| 507 | 0x000106A0, | ||
| 508 | /* 321333.pdf - AAM126 - C1 - Xeon 3500 */ | ||
| 509 | 0x000106A1, | ||
| 510 | /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */ | ||
| 511 | 0x000106A4, | ||
| 512 | /* 321333.pdf - AAM126 - D0 - Xeon 3500 */ | ||
| 513 | /* 321324.pdf - AAK139 - D0 - Xeon 5500 */ | ||
| 514 | /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */ | ||
| 515 | 0x000106A5, | ||
| 516 | /* Xeon E3-1220 V2 */ | ||
| 517 | 0x000306A8, | ||
| 518 | }; | ||
| 519 | |||
| 520 | static inline bool cpu_has_broken_vmx_preemption_timer(void) | ||
| 521 | { | ||
| 522 | u32 eax = cpuid_eax(0x00000001), i; | ||
| 523 | |||
| 524 | /* Clear the reserved bits */ | ||
| 525 | eax &= ~(0x3U << 14 | 0xfU << 28); | ||
| 526 | for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++) | ||
| 527 | if (eax == vmx_preemption_cpu_tfms[i]) | ||
| 528 | return true; | ||
| 529 | |||
| 530 | return false; | ||
| 531 | } | ||
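The mask applied above clears the reserved bits (15:14 and 31:28) of CPUID.1.EAX so that the remaining family/model/stepping signature can be compared against the errata list. As an aside, a small userspace sketch of how such a signature decodes; by this decoding, 0x000106A5 from the table above is family 6, model 0x1A, stepping 5, which should correspond to a Nehalem-class part:

#include <stdint.h>
#include <stdio.h>

static void decode_signature(uint32_t eax)
{
	uint32_t stepping   = eax & 0xf;
	uint32_t model      = (eax >> 4) & 0xf;
	uint32_t family     = (eax >> 8) & 0xf;
	uint32_t ext_model  = (eax >> 16) & 0xf;
	uint32_t ext_family = (eax >> 20) & 0xff;

	/* Extended model applies to families 6 and 15; extended family to 15 only. */
	if (family == 0x6 || family == 0xf)
		model |= ext_model << 4;
	if (family == 0xf)
		family += ext_family;

	printf("eax 0x%08x -> family 0x%x, model 0x%x, stepping 0x%x\n",
	       eax, family, model, stepping);
}

int main(void)
{
	decode_signature(0x000106A5);	/* one of the affected steppings listed above */
	return 0;
}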
| 532 | |||
| 533 | static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) | ||
| 534 | { | ||
| 535 | return flexpriority_enabled && lapic_in_kernel(vcpu); | ||
| 536 | } | ||
| 537 | |||
| 538 | static inline bool report_flexpriority(void) | ||
| 539 | { | ||
| 540 | return flexpriority_enabled; | ||
| 541 | } | ||
| 542 | |||
| 543 | static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) | ||
| 544 | { | ||
| 545 | int i; | ||
| 546 | |||
| 547 | for (i = 0; i < vmx->nmsrs; ++i) | ||
| 548 | if (vmx_msr_index[vmx->guest_msrs[i].index] == msr) | ||
| 549 | return i; | ||
| 550 | return -1; | ||
| 551 | } | ||
| 552 | |||
| 553 | struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) | ||
| 554 | { | ||
| 555 | int i; | ||
| 556 | |||
| 557 | i = __find_msr_index(vmx, msr); | ||
| 558 | if (i >= 0) | ||
| 559 | return &vmx->guest_msrs[i]; | ||
| 560 | return NULL; | ||
| 561 | } | ||
| 562 | |||
| 563 | void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) | ||
| 564 | { | ||
| 565 | vmcs_clear(loaded_vmcs->vmcs); | ||
| 566 | if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) | ||
| 567 | vmcs_clear(loaded_vmcs->shadow_vmcs); | ||
| 568 | loaded_vmcs->cpu = -1; | ||
| 569 | loaded_vmcs->launched = 0; | ||
| 570 | } | ||
| 571 | |||
| 572 | #ifdef CONFIG_KEXEC_CORE | ||
| 573 | /* | ||
| 574 | * This bitmap is used to indicate, per CPU, whether the crash-time | ||
| 575 | * vmclear operation is enabled. All CPUs are disabled by | ||
| 576 | * default. | ||
| 577 | */ | ||
| 578 | static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE; | ||
| 579 | |||
| 580 | static inline void crash_enable_local_vmclear(int cpu) | ||
| 581 | { | ||
| 582 | cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap); | ||
| 583 | } | ||
| 584 | |||
| 585 | static inline void crash_disable_local_vmclear(int cpu) | ||
| 586 | { | ||
| 587 | cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap); | ||
| 588 | } | ||
| 589 | |||
| 590 | static inline int crash_local_vmclear_enabled(int cpu) | ||
| 591 | { | ||
| 592 | return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap); | ||
| 593 | } | ||
| 594 | |||
| 595 | static void crash_vmclear_local_loaded_vmcss(void) | ||
| 596 | { | ||
| 597 | int cpu = raw_smp_processor_id(); | ||
| 598 | struct loaded_vmcs *v; | ||
| 599 | |||
| 600 | if (!crash_local_vmclear_enabled(cpu)) | ||
| 601 | return; | ||
| 602 | |||
| 603 | list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), | ||
| 604 | loaded_vmcss_on_cpu_link) | ||
| 605 | vmcs_clear(v->vmcs); | ||
| 606 | } | ||
| 607 | #else | ||
| 608 | static inline void crash_enable_local_vmclear(int cpu) { } | ||
| 609 | static inline void crash_disable_local_vmclear(int cpu) { } | ||
| 610 | #endif /* CONFIG_KEXEC_CORE */ | ||
| 611 | |||
| 612 | static void __loaded_vmcs_clear(void *arg) | ||
| 613 | { | ||
| 614 | struct loaded_vmcs *loaded_vmcs = arg; | ||
| 615 | int cpu = raw_smp_processor_id(); | ||
| 616 | |||
| 617 | if (loaded_vmcs->cpu != cpu) | ||
| 618 | return; /* vcpu migration can race with cpu offline */ | ||
| 619 | if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) | ||
| 620 | per_cpu(current_vmcs, cpu) = NULL; | ||
| 621 | crash_disable_local_vmclear(cpu); | ||
| 622 | list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); | ||
| 623 | |||
| 624 | /* | ||
| 625 | * Ensure that the update to loaded_vmcs->loaded_vmcss_on_cpu_link | ||
| 626 | * happens before loaded_vmcs->cpu is set to -1, which is done in | ||
| 627 | * loaded_vmcs_init. Otherwise, another CPU could see cpu == -1 first | ||
| 628 | * and then add the vmcs to its per-CPU list before it is deleted here. | ||
| 629 | */ | ||
| 630 | smp_wmb(); | ||
| 631 | |||
| 632 | loaded_vmcs_init(loaded_vmcs); | ||
| 633 | crash_enable_local_vmclear(cpu); | ||
| 634 | } | ||
| 635 | |||
| 636 | void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) | ||
| 637 | { | ||
| 638 | int cpu = loaded_vmcs->cpu; | ||
| 639 | |||
| 640 | if (cpu != -1) | ||
| 641 | smp_call_function_single(cpu, | ||
| 642 | __loaded_vmcs_clear, loaded_vmcs, 1); | ||
| 643 | } | ||
| 644 | |||
| 645 | static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, | ||
| 646 | unsigned field) | ||
| 647 | { | ||
| 648 | bool ret; | ||
| 649 | u32 mask = 1 << (seg * SEG_FIELD_NR + field); | ||
| 650 | |||
| 651 | if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) { | ||
| 652 | vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS); | ||
| 653 | vmx->segment_cache.bitmask = 0; | ||
| 654 | } | ||
| 655 | ret = vmx->segment_cache.bitmask & mask; | ||
| 656 | vmx->segment_cache.bitmask |= mask; | ||
| 657 | return ret; | ||
| 658 | } | ||
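The segment cache maps every (segment, field) pair to one bit of a 32-bit mask, bit = seg * SEG_FIELD_NR + field, so the read helpers below can skip a VMREAD when the field has already been fetched since the last invalidation. A toy version of the test-and-set, assuming SEG_FIELD_NR is 4 (selector, base, limit, access rights):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SEG_FIELD_NR 4	/* selector, base, limit, AR bytes (assumed) */

/* Returns whether the field was already cached, and marks it cached. */
static bool cache_test_set(uint32_t *bitmask, unsigned int seg, unsigned int field)
{
	uint32_t mask = 1u << (seg * SEG_FIELD_NR + field);
	bool was_cached = *bitmask & mask;

	*bitmask |= mask;
	return was_cached;
}

int main(void)
{
	uint32_t bitmask = 0;
	bool first = cache_test_set(&bitmask, 1, 2);	/* false: not cached yet */
	bool second = cache_test_set(&bitmask, 1, 2);	/* true: cached now */

	printf("%d %d\n", first, second);
	return 0;
}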
| 659 | |||
| 660 | static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) | ||
| 661 | { | ||
| 662 | u16 *p = &vmx->segment_cache.seg[seg].selector; | ||
| 663 | |||
| 664 | if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) | ||
| 665 | *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); | ||
| 666 | return *p; | ||
| 667 | } | ||
| 668 | |||
| 669 | static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) | ||
| 670 | { | ||
| 671 | ulong *p = &vmx->segment_cache.seg[seg].base; | ||
| 672 | |||
| 673 | if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) | ||
| 674 | *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); | ||
| 675 | return *p; | ||
| 676 | } | ||
| 677 | |||
| 678 | static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) | ||
| 679 | { | ||
| 680 | u32 *p = &vmx->segment_cache.seg[seg].limit; | ||
| 681 | |||
| 682 | if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) | ||
| 683 | *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); | ||
| 684 | return *p; | ||
| 685 | } | ||
| 686 | |||
| 687 | static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) | ||
| 688 | { | ||
| 689 | u32 *p = &vmx->segment_cache.seg[seg].ar; | ||
| 690 | |||
| 691 | if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) | ||
| 692 | *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); | ||
| 693 | return *p; | ||
| 694 | } | ||
| 695 | |||
| 696 | void update_exception_bitmap(struct kvm_vcpu *vcpu) | ||
| 697 | { | ||
| 698 | u32 eb; | ||
| 699 | |||
| 700 | eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | | ||
| 701 | (1u << DB_VECTOR) | (1u << AC_VECTOR); | ||
| 702 | /* | ||
| 703 | * Guest access to VMware backdoor ports could legitimately | ||
| 704 | * trigger #GP because of TSS I/O permission bitmap. | ||
| 705 | * We intercept those #GP and allow access to them anyway | ||
| 706 | * as VMware does. | ||
| 707 | */ | ||
| 708 | if (enable_vmware_backdoor) | ||
| 709 | eb |= (1u << GP_VECTOR); | ||
| 710 | if ((vcpu->guest_debug & | ||
| 711 | (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == | ||
| 712 | (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) | ||
| 713 | eb |= 1u << BP_VECTOR; | ||
| 714 | if (to_vmx(vcpu)->rmode.vm86_active) | ||
| 715 | eb = ~0; | ||
| 716 | if (enable_ept) | ||
| 717 | eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ | ||
| 718 | |||
| 719 | /* When we are running a nested L2 guest and L1 specified for it a | ||
| 720 | * certain exception bitmap, we must trap the same exceptions and pass | ||
| 721 | * them to L1. When running L2, we will only handle the exceptions | ||
| 722 | * specified above if L1 did not want them. | ||
| 723 | */ | ||
| 724 | if (is_guest_mode(vcpu)) | ||
| 725 | eb |= get_vmcs12(vcpu)->exception_bitmap; | ||
| 726 | |||
| 727 | vmcs_write32(EXCEPTION_BITMAP, eb); | ||
| 728 | } | ||
| 729 | |||
| 730 | /* | ||
| 731 | * Check whether a write to the MSR is intercepted by the currently loaded MSR bitmap. | ||
| 732 | */ | ||
| 733 | static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) | ||
| 734 | { | ||
| 735 | unsigned long *msr_bitmap; | ||
| 736 | int f = sizeof(unsigned long); | ||
| 737 | |||
| 738 | if (!cpu_has_vmx_msr_bitmap()) | ||
| 739 | return true; | ||
| 740 | |||
| 741 | msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap; | ||
| 742 | |||
| 743 | if (msr <= 0x1fff) { | ||
| 744 | return !!test_bit(msr, msr_bitmap + 0x800 / f); | ||
| 745 | } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { | ||
| 746 | msr &= 0x1fff; | ||
| 747 | return !!test_bit(msr, msr_bitmap + 0xc00 / f); | ||
| 748 | } | ||
| 749 | |||
| 750 | return true; | ||
| 751 | } | ||
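The offsets used above follow the architectural MSR-bitmap layout: a single 4 KiB page in which the write bitmaps for low MSRs (0x00000000-0x00001fff) and high MSRs (0xc0000000-0xc0001fff) start at byte offsets 0x800 and 0xc00, one bit per MSR. A freestanding sketch of the same lookup on a plain byte array:

#include <stdbool.h>
#include <stdint.h>

/* "bitmap" is a 4096-byte MSR bitmap; returns true if writes are intercepted. */
static bool write_intercepted(const uint8_t *bitmap, uint32_t msr)
{
	uint32_t byte;

	if (msr <= 0x1fff) {
		byte = 0x800 + msr / 8;		/* write bitmap, low MSRs */
	} else if (msr >= 0xc0000000 && msr <= 0xc0001fff) {
		msr &= 0x1fff;
		byte = 0xc00 + msr / 8;		/* write bitmap, high MSRs */
	} else {
		return true;			/* outside the bitmap: always intercepted */
	}

	return bitmap[byte] & (1u << (msr & 7));
}

int main(void)
{
	uint8_t bitmap[4096] = { 0 };

	bitmap[0x800] |= 1u << 0;		/* intercept writes to MSR 0 */
	return write_intercepted(bitmap, 0) ? 0 : 1;
}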
| 752 | |||
| 753 | static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, | ||
| 754 | unsigned long entry, unsigned long exit) | ||
| 755 | { | ||
| 756 | vm_entry_controls_clearbit(vmx, entry); | ||
| 757 | vm_exit_controls_clearbit(vmx, exit); | ||
| 758 | } | ||
| 759 | |||
| 760 | static int find_msr(struct vmx_msrs *m, unsigned int msr) | ||
| 761 | { | ||
| 762 | unsigned int i; | ||
| 763 | |||
| 764 | for (i = 0; i < m->nr; ++i) { | ||
| 765 | if (m->val[i].index == msr) | ||
| 766 | return i; | ||
| 767 | } | ||
| 768 | return -ENOENT; | ||
| 769 | } | ||
| 770 | |||
| 771 | static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) | ||
| 772 | { | ||
| 773 | int i; | ||
| 774 | struct msr_autoload *m = &vmx->msr_autoload; | ||
| 775 | |||
| 776 | switch (msr) { | ||
| 777 | case MSR_EFER: | ||
| 778 | if (cpu_has_load_ia32_efer()) { | ||
| 779 | clear_atomic_switch_msr_special(vmx, | ||
| 780 | VM_ENTRY_LOAD_IA32_EFER, | ||
| 781 | VM_EXIT_LOAD_IA32_EFER); | ||
| 782 | return; | ||
| 783 | } | ||
| 784 | break; | ||
| 785 | case MSR_CORE_PERF_GLOBAL_CTRL: | ||
| 786 | if (cpu_has_load_perf_global_ctrl()) { | ||
| 787 | clear_atomic_switch_msr_special(vmx, | ||
| 788 | VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, | ||
| 789 | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); | ||
| 790 | return; | ||
| 791 | } | ||
| 792 | break; | ||
| 793 | } | ||
| 794 | i = find_msr(&m->guest, msr); | ||
| 795 | if (i < 0) | ||
| 796 | goto skip_guest; | ||
| 797 | --m->guest.nr; | ||
| 798 | m->guest.val[i] = m->guest.val[m->guest.nr]; | ||
| 799 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); | ||
| 800 | |||
| 801 | skip_guest: | ||
| 802 | i = find_msr(&m->host, msr); | ||
| 803 | if (i < 0) | ||
| 804 | return; | ||
| 805 | |||
| 806 | --m->host.nr; | ||
| 807 | m->host.val[i] = m->host.val[m->host.nr]; | ||
| 808 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); | ||
| 809 | } | ||
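clear_atomic_switch_msr() removes an autoload entry by overwriting it with the last element and shrinking the count, which keeps the array dense at the cost of ordering, and then rewrites the corresponding VMCS count field. A generic sketch of that swap-remove idiom on a hypothetical list type:

#include <stdio.h>

struct entry {
	unsigned int index;
	unsigned long long value;
};

struct autoload_list {
	struct entry val[16];
	unsigned int nr;
};

/* Remove slot i by moving the last entry into it; order is not preserved. */
static void swap_remove(struct autoload_list *l, unsigned int i)
{
	if (i >= l->nr)
		return;
	l->nr--;
	l->val[i] = l->val[l->nr];
}

int main(void)
{
	struct autoload_list l = { .val = { { 1, 10 }, { 2, 20 }, { 3, 30 } }, .nr = 3 };

	swap_remove(&l, 0);
	printf("nr=%u first=%u\n", l.nr, l.val[0].index);	/* nr=2 first=3 */
	return 0;
}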
| 810 | |||
| 811 | static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, | ||
| 812 | unsigned long entry, unsigned long exit, | ||
| 813 | unsigned long guest_val_vmcs, unsigned long host_val_vmcs, | ||
| 814 | u64 guest_val, u64 host_val) | ||
| 815 | { | ||
| 816 | vmcs_write64(guest_val_vmcs, guest_val); | ||
| 817 | if (host_val_vmcs != HOST_IA32_EFER) | ||
| 818 | vmcs_write64(host_val_vmcs, host_val); | ||
| 819 | vm_entry_controls_setbit(vmx, entry); | ||
| 820 | vm_exit_controls_setbit(vmx, exit); | ||
| 821 | } | ||
| 822 | |||
| 823 | static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, | ||
| 824 | u64 guest_val, u64 host_val, bool entry_only) | ||
| 825 | { | ||
| 826 | int i, j = 0; | ||
| 827 | struct msr_autoload *m = &vmx->msr_autoload; | ||
| 828 | |||
| 829 | switch (msr) { | ||
| 830 | case MSR_EFER: | ||
| 831 | if (cpu_has_load_ia32_efer()) { | ||
| 832 | add_atomic_switch_msr_special(vmx, | ||
| 833 | VM_ENTRY_LOAD_IA32_EFER, | ||
| 834 | VM_EXIT_LOAD_IA32_EFER, | ||
| 835 | GUEST_IA32_EFER, | ||
| 836 | HOST_IA32_EFER, | ||
| 837 | guest_val, host_val); | ||
| 838 | return; | ||
| 839 | } | ||
| 840 | break; | ||
| 841 | case MSR_CORE_PERF_GLOBAL_CTRL: | ||
| 842 | if (cpu_has_load_perf_global_ctrl()) { | ||
| 843 | add_atomic_switch_msr_special(vmx, | ||
| 844 | VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, | ||
| 845 | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, | ||
| 846 | GUEST_IA32_PERF_GLOBAL_CTRL, | ||
| 847 | HOST_IA32_PERF_GLOBAL_CTRL, | ||
| 848 | guest_val, host_val); | ||
| 849 | return; | ||
| 850 | } | ||
| 851 | break; | ||
| 852 | case MSR_IA32_PEBS_ENABLE: | ||
| 853 | /* PEBS needs a quiescent period after being disabled (to write | ||
| 854 | * a record). Disabling PEBS through VMX MSR swapping doesn't | ||
| 855 | * provide that period, so a CPU could write host's record into | ||
| 856 | * guest's memory. | ||
| 857 | */ | ||
| 858 | wrmsrl(MSR_IA32_PEBS_ENABLE, 0); | ||
| 859 | } | ||
| 860 | |||
| 861 | i = find_msr(&m->guest, msr); | ||
| 862 | if (!entry_only) | ||
| 863 | j = find_msr(&m->host, msr); | ||
| 864 | |||
| 865 | if (i == NR_AUTOLOAD_MSRS || j == NR_AUTOLOAD_MSRS) { | ||
| 866 | printk_once(KERN_WARNING "Not enough msr switch entries. " | ||
| 867 | "Can't add msr %x\n", msr); | ||
| 868 | return; | ||
| 869 | } | ||
| 870 | if (i < 0) { | ||
| 871 | i = m->guest.nr++; | ||
| 872 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); | ||
| 873 | } | ||
| 874 | m->guest.val[i].index = msr; | ||
| 875 | m->guest.val[i].value = guest_val; | ||
| 876 | |||
| 877 | if (entry_only) | ||
| 878 | return; | ||
| 879 | |||
| 880 | if (j < 0) { | ||
| 881 | j = m->host.nr++; | ||
| 882 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); | ||
| 883 | } | ||
| 884 | m->host.val[j].index = msr; | ||
| 885 | m->host.val[j].value = host_val; | ||
| 886 | } | ||
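
clear_atomic_switch_msr() and add_atomic_switch_msr() keep the guest/host autoload arrays dense by swapping the last entry into a freed slot and shrinking the count that is mirrored into VM_ENTRY_MSR_LOAD_COUNT / VM_EXIT_MSR_LOAD_COUNT. A standalone sketch of that swap-with-last bookkeeping follows; the struct names and NR_ENTRIES are illustrative, not the kernel's.

	/* Standalone sketch of the swap-with-last removal used for the MSR
	 * autoload arrays; KVM would additionally write the new 'nr' into the
	 * VMCS load counts.
	 */
	#include <stdint.h>
	#include <stdio.h>

	#define NR_ENTRIES 8			/* illustrative, not NR_AUTOLOAD_MSRS */

	struct msr_pair { uint32_t index; uint64_t value; };
	struct msr_list { unsigned int nr; struct msr_pair val[NR_ENTRIES]; };

	static int find(const struct msr_list *m, uint32_t msr)
	{
		for (unsigned int i = 0; i < m->nr; i++)
			if (m->val[i].index == msr)
				return (int)i;
		return -1;
	}

	static void remove_msr(struct msr_list *m, uint32_t msr)
	{
		int i = find(m, msr);

		if (i < 0)
			return;
		m->val[i] = m->val[--m->nr];	/* order is irrelevant, so swap with last */
	}

	int main(void)
	{
		struct msr_list m = { .nr = 3, .val = {
			{ 0x3f1, 0 }, { 0xc0000080, 0x500 }, { 0xd90, 1 } } };

		remove_msr(&m, 0x3f1);
		printf("nr=%u first=%#x\n", m.nr, m.val[0].index);	/* nr=2 first=0xd90 */
		return 0;
	}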
| 887 | |||
| 888 | static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) | ||
| 889 | { | ||
| 890 | u64 guest_efer = vmx->vcpu.arch.efer; | ||
| 891 | u64 ignore_bits = 0; | ||
| 892 | |||
| 893 | if (!enable_ept) { | ||
| 894 | /* | ||
| 895 | * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing | ||
| 896 | * host CPUID is more efficient than testing guest CPUID | ||
| 897 | * or CR4. Host SMEP is anyway a requirement for guest SMEP. | ||
| 898 | */ | ||
| 899 | if (boot_cpu_has(X86_FEATURE_SMEP)) | ||
| 900 | guest_efer |= EFER_NX; | ||
| 901 | else if (!(guest_efer & EFER_NX)) | ||
| 902 | ignore_bits |= EFER_NX; | ||
| 903 | } | ||
| 904 | |||
| 905 | /* | ||
| 906 | * LMA and LME handled by hardware; SCE meaningless outside long mode. | ||
| 907 | */ | ||
| 908 | ignore_bits |= EFER_SCE; | ||
| 909 | #ifdef CONFIG_X86_64 | ||
| 910 | ignore_bits |= EFER_LMA | EFER_LME; | ||
| 911 | /* SCE is meaningful only in long mode on Intel */ | ||
| 912 | if (guest_efer & EFER_LMA) | ||
| 913 | ignore_bits &= ~(u64)EFER_SCE; | ||
| 914 | #endif | ||
| 915 | |||
| 916 | /* | ||
| 917 | * On EPT, we can't emulate NX, so we must switch EFER atomically. | ||
| 918 | * On CPUs that support "load IA32_EFER", always switch EFER | ||
| 919 | * atomically, since it's faster than switching it manually. | ||
| 920 | */ | ||
| 921 | if (cpu_has_load_ia32_efer() || | ||
| 922 | (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) { | ||
| 923 | if (!(guest_efer & EFER_LMA)) | ||
| 924 | guest_efer &= ~EFER_LME; | ||
| 925 | if (guest_efer != host_efer) | ||
| 926 | add_atomic_switch_msr(vmx, MSR_EFER, | ||
| 927 | guest_efer, host_efer, false); | ||
| 928 | else | ||
| 929 | clear_atomic_switch_msr(vmx, MSR_EFER); | ||
| 930 | return false; | ||
| 931 | } else { | ||
| 932 | clear_atomic_switch_msr(vmx, MSR_EFER); | ||
| 933 | |||
| 934 | guest_efer &= ~ignore_bits; | ||
| 935 | guest_efer |= host_efer & ignore_bits; | ||
| 936 | |||
| 937 | vmx->guest_msrs[efer_offset].data = guest_efer; | ||
| 938 | vmx->guest_msrs[efer_offset].mask = ~ignore_bits; | ||
| 939 | |||
| 940 | return true; | ||
| 941 | } | ||
| 942 | } | ||
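
When EFER is not switched atomically, the code above folds the host's values into the bits it intends to ignore and records a mask, so the shared-MSR machinery only ever changes bits that really differ. A small sketch of that merge, with invented EFER values purely for illustration:

	/* Standalone sketch of the ignore_bits merge in update_transition_efer(). */
	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t guest_efer  = 0xd00;	/* NX|LMA|LME set, SCE clear (invented) */
		uint64_t host_efer   = 0xd01;	/* host additionally has SCE set        */
		uint64_t ignore_bits = 0x1;	/* pretend SCE is to be ignored here    */

		/* Take the guest's value except in the ignored bits, which follow the host. */
		guest_efer &= ~ignore_bits;
		guest_efer |= host_efer & ignore_bits;

		printf("effective guest EFER %#llx, mask %#llx\n",
		       (unsigned long long)guest_efer,
		       (unsigned long long)~ignore_bits);
		return 0;
	}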
| 943 | |||
| 944 | #ifdef CONFIG_X86_32 | ||
| 945 | /* | ||
| 946 | * On 32-bit kernels, VM exits still load the FS and GS bases from the | ||
| 947 | * VMCS rather than the segment table. KVM uses this helper to figure | ||
| 948 | * out the current bases to poke them into the VMCS before entry. | ||
| 949 | */ | ||
| 950 | static unsigned long segment_base(u16 selector) | ||
| 951 | { | ||
| 952 | struct desc_struct *table; | ||
| 953 | unsigned long v; | ||
| 954 | |||
| 955 | if (!(selector & ~SEGMENT_RPL_MASK)) | ||
| 956 | return 0; | ||
| 957 | |||
| 958 | table = get_current_gdt_ro(); | ||
| 959 | |||
| 960 | if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { | ||
| 961 | u16 ldt_selector = kvm_read_ldt(); | ||
| 962 | |||
| 963 | if (!(ldt_selector & ~SEGMENT_RPL_MASK)) | ||
| 964 | return 0; | ||
| 965 | |||
| 966 | table = (struct desc_struct *)segment_base(ldt_selector); | ||
| 967 | } | ||
| 968 | v = get_desc_base(&table[selector >> 3]); | ||
| 969 | return v; | ||
| 970 | } | ||
| 971 | #endif | ||
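
segment_base() walks the GDT (or, for TI=1 selectors, the LDT) and relies on get_desc_base() to stitch the 32-bit base back together from the three base fields of a legacy descriptor. Below is a self-contained sketch of that extraction from a raw 8-byte descriptor; the field layout follows the SDM and the helper name is illustrative.

	/* Standalone sketch: recover the 32-bit base address from a raw 8-byte
	 * legacy segment descriptor, as get_desc_base() does for segment_base().
	 */
	#include <stdint.h>
	#include <stdio.h>

	static uint32_t desc_base(uint64_t desc)
	{
		uint32_t base0 = (desc >> 16) & 0xffff;	/* base bits  0..15 */
		uint32_t base1 = (desc >> 32) & 0xff;	/* base bits 16..23 */
		uint32_t base2 = (desc >> 56) & 0xff;	/* base bits 24..31 */

		return base0 | (base1 << 16) | (base2 << 24);
	}

	int main(void)
	{
		/* Descriptor with base 0x12345678 scattered across the three fields. */
		uint64_t desc = ((uint64_t)0x12 << 56) | ((uint64_t)0x34 << 32) |
				((uint64_t)0x5678 << 16);

		printf("base=%#x\n", desc_base(desc));	/* base=0x12345678 */
		return 0;
	}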
| 972 | |||
| 973 | static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) | ||
| 974 | { | ||
| 975 | u32 i; | ||
| 976 | |||
| 977 | wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status); | ||
| 978 | wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); | ||
| 979 | wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); | ||
| 980 | wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); | ||
| 981 | for (i = 0; i < addr_range; i++) { | ||
| 982 | wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); | ||
| 983 | wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); | ||
| 984 | } | ||
| 985 | } | ||
| 986 | |||
| 987 | static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) | ||
| 988 | { | ||
| 989 | u32 i; | ||
| 990 | |||
| 991 | rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status); | ||
| 992 | rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); | ||
| 993 | rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); | ||
| 994 | rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); | ||
| 995 | for (i = 0; i < addr_range; i++) { | ||
| 996 | rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); | ||
| 997 | rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); | ||
| 998 | } | ||
| 999 | } | ||
| 1000 | |||
| 1001 | static void pt_guest_enter(struct vcpu_vmx *vmx) | ||
| 1002 | { | ||
| 1003 | if (pt_mode == PT_MODE_SYSTEM) | ||
| 1004 | return; | ||
| 1005 | |||
| 1006 | /* | ||
| 1007 | * GUEST_IA32_RTIT_CTL is already set in the VMCS. | ||
| 1008 | * Save host state before VM entry. | ||
| 1009 | */ | ||
| 1010 | rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); | ||
| 1011 | if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { | ||
| 1012 | wrmsrl(MSR_IA32_RTIT_CTL, 0); | ||
| 1013 | pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range); | ||
| 1014 | pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range); | ||
| 1015 | } | ||
| 1016 | } | ||
| 1017 | |||
| 1018 | static void pt_guest_exit(struct vcpu_vmx *vmx) | ||
| 1019 | { | ||
| 1020 | if (pt_mode == PT_MODE_SYSTEM) | ||
| 1021 | return; | ||
| 1022 | |||
| 1023 | if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { | ||
| 1024 | pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range); | ||
| 1025 | pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range); | ||
| 1026 | } | ||
| 1027 | |||
| 1028 | /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */ | ||
| 1029 | wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); | ||
| 1030 | } | ||
| 1031 | |||
| 1032 | void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) | ||
| 1033 | { | ||
| 1034 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 1035 | struct vmcs_host_state *host_state; | ||
| 1036 | #ifdef CONFIG_X86_64 | ||
| 1037 | int cpu = raw_smp_processor_id(); | ||
| 1038 | #endif | ||
| 1039 | unsigned long fs_base, gs_base; | ||
| 1040 | u16 fs_sel, gs_sel; | ||
| 1041 | int i; | ||
| 1042 | |||
| 1043 | vmx->req_immediate_exit = false; | ||
| 1044 | |||
| 1045 | /* | ||
| 1046 | * Note that guest MSRs to be saved/restored can also be changed | ||
| 1047 | * when guest state is loaded. This happens when guest transitions | ||
| 1048 | * to/from long-mode by setting MSR_EFER.LMA. | ||
| 1049 | */ | ||
| 1050 | if (!vmx->loaded_cpu_state || vmx->guest_msrs_dirty) { | ||
| 1051 | vmx->guest_msrs_dirty = false; | ||
| 1052 | for (i = 0; i < vmx->save_nmsrs; ++i) | ||
| 1053 | kvm_set_shared_msr(vmx->guest_msrs[i].index, | ||
| 1054 | vmx->guest_msrs[i].data, | ||
| 1055 | vmx->guest_msrs[i].mask); | ||
| 1056 | |||
| 1057 | } | ||
| 1058 | |||
| 1059 | if (vmx->loaded_cpu_state) | ||
| 1060 | return; | ||
| 1061 | |||
| 1062 | vmx->loaded_cpu_state = vmx->loaded_vmcs; | ||
| 1063 | host_state = &vmx->loaded_cpu_state->host_state; | ||
| 1064 | |||
| 1065 | /* | ||
| 1066 | * Set host fs and gs selectors. Unfortunately, 22.2.3 does not | ||
| 1067 | * allow segment selectors with cpl > 0 or ti == 1. | ||
| 1068 | */ | ||
| 1069 | host_state->ldt_sel = kvm_read_ldt(); | ||
| 1070 | |||
| 1071 | #ifdef CONFIG_X86_64 | ||
| 1072 | savesegment(ds, host_state->ds_sel); | ||
| 1073 | savesegment(es, host_state->es_sel); | ||
| 1074 | |||
| 1075 | gs_base = cpu_kernelmode_gs_base(cpu); | ||
| 1076 | if (likely(is_64bit_mm(current->mm))) { | ||
| 1077 | save_fsgs_for_kvm(); | ||
| 1078 | fs_sel = current->thread.fsindex; | ||
| 1079 | gs_sel = current->thread.gsindex; | ||
| 1080 | fs_base = current->thread.fsbase; | ||
| 1081 | vmx->msr_host_kernel_gs_base = current->thread.gsbase; | ||
| 1082 | } else { | ||
| 1083 | savesegment(fs, fs_sel); | ||
| 1084 | savesegment(gs, gs_sel); | ||
| 1085 | fs_base = read_msr(MSR_FS_BASE); | ||
| 1086 | vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); | ||
| 1087 | } | ||
| 1088 | |||
| 1089 | wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); | ||
| 1090 | #else | ||
| 1091 | savesegment(fs, fs_sel); | ||
| 1092 | savesegment(gs, gs_sel); | ||
| 1093 | fs_base = segment_base(fs_sel); | ||
| 1094 | gs_base = segment_base(gs_sel); | ||
| 1095 | #endif | ||
| 1096 | |||
| 1097 | if (unlikely(fs_sel != host_state->fs_sel)) { | ||
| 1098 | if (!(fs_sel & 7)) | ||
| 1099 | vmcs_write16(HOST_FS_SELECTOR, fs_sel); | ||
| 1100 | else | ||
| 1101 | vmcs_write16(HOST_FS_SELECTOR, 0); | ||
| 1102 | host_state->fs_sel = fs_sel; | ||
| 1103 | } | ||
| 1104 | if (unlikely(gs_sel != host_state->gs_sel)) { | ||
| 1105 | if (!(gs_sel & 7)) | ||
| 1106 | vmcs_write16(HOST_GS_SELECTOR, gs_sel); | ||
| 1107 | else | ||
| 1108 | vmcs_write16(HOST_GS_SELECTOR, 0); | ||
| 1109 | host_state->gs_sel = gs_sel; | ||
| 1110 | } | ||
| 1111 | if (unlikely(fs_base != host_state->fs_base)) { | ||
| 1112 | vmcs_writel(HOST_FS_BASE, fs_base); | ||
| 1113 | host_state->fs_base = fs_base; | ||
| 1114 | } | ||
| 1115 | if (unlikely(gs_base != host_state->gs_base)) { | ||
| 1116 | vmcs_writel(HOST_GS_BASE, gs_base); | ||
| 1117 | host_state->gs_base = gs_base; | ||
| 1118 | } | ||
| 1119 | } | ||
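
The "& 7" tests above work because a selector packs the RPL in bits 1:0 and the TI bit in bit 2, with the table index above that, and the VMCS host-selector fields only accept RPL 0 / TI 0 values (see the 22.2.3 comment earlier in this function). A tiny standalone sketch of that decomposition, with an example selector value chosen for illustration:

	/* Standalone sketch: decompose an x86 segment selector as the checks above do. */
	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint16_t sel = 0x002b;	/* e.g. a typical user data selector */

		printf("index=%u TI=%u RPL=%u vmcs-host-ok=%d\n",
		       sel >> 3, (sel >> 2) & 1, sel & 3, (sel & 7) == 0);
		return 0;
	}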
| 1120 | |||
| 1121 | static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) | ||
| 1122 | { | ||
| 1123 | struct vmcs_host_state *host_state; | ||
| 1124 | |||
| 1125 | if (!vmx->loaded_cpu_state) | ||
| 1126 | return; | ||
| 1127 | |||
| 1128 | WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs); | ||
| 1129 | host_state = &vmx->loaded_cpu_state->host_state; | ||
| 1130 | |||
| 1131 | ++vmx->vcpu.stat.host_state_reload; | ||
| 1132 | vmx->loaded_cpu_state = NULL; | ||
| 1133 | |||
| 1134 | #ifdef CONFIG_X86_64 | ||
| 1135 | rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); | ||
| 1136 | #endif | ||
| 1137 | if (host_state->ldt_sel || (host_state->gs_sel & 7)) { | ||
| 1138 | kvm_load_ldt(host_state->ldt_sel); | ||
| 1139 | #ifdef CONFIG_X86_64 | ||
| 1140 | load_gs_index(host_state->gs_sel); | ||
| 1141 | #else | ||
| 1142 | loadsegment(gs, host_state->gs_sel); | ||
| 1143 | #endif | ||
| 1144 | } | ||
| 1145 | if (host_state->fs_sel & 7) | ||
| 1146 | loadsegment(fs, host_state->fs_sel); | ||
| 1147 | #ifdef CONFIG_X86_64 | ||
| 1148 | if (unlikely(host_state->ds_sel | host_state->es_sel)) { | ||
| 1149 | loadsegment(ds, host_state->ds_sel); | ||
| 1150 | loadsegment(es, host_state->es_sel); | ||
| 1151 | } | ||
| 1152 | #endif | ||
| 1153 | invalidate_tss_limit(); | ||
| 1154 | #ifdef CONFIG_X86_64 | ||
| 1155 | wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); | ||
| 1156 | #endif | ||
| 1157 | load_fixmap_gdt(raw_smp_processor_id()); | ||
| 1158 | } | ||
| 1159 | |||
| 1160 | #ifdef CONFIG_X86_64 | ||
| 1161 | static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) | ||
| 1162 | { | ||
| 1163 | preempt_disable(); | ||
| 1164 | if (vmx->loaded_cpu_state) | ||
| 1165 | rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); | ||
| 1166 | preempt_enable(); | ||
| 1167 | return vmx->msr_guest_kernel_gs_base; | ||
| 1168 | } | ||
| 1169 | |||
| 1170 | static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) | ||
| 1171 | { | ||
| 1172 | preempt_disable(); | ||
| 1173 | if (vmx->loaded_cpu_state) | ||
| 1174 | wrmsrl(MSR_KERNEL_GS_BASE, data); | ||
| 1175 | preempt_enable(); | ||
| 1176 | vmx->msr_guest_kernel_gs_base = data; | ||
| 1177 | } | ||
| 1178 | #endif | ||
| 1179 | |||
| 1180 | static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) | ||
| 1181 | { | ||
| 1182 | struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); | ||
| 1183 | struct pi_desc old, new; | ||
| 1184 | unsigned int dest; | ||
| 1185 | |||
| 1186 | /* | ||
| 1187 | * In case of hot-plug or hot-unplug, we may have to undo | ||
| 1188 | * vmx_vcpu_pi_put even if there is no assigned device. And we | ||
| 1189 | * always keep PI.NDST up to date for simplicity: it makes the | ||
| 1190 | * code easier, and CPU migration is not a fast path. | ||
| 1191 | */ | ||
| 1192 | if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) | ||
| 1193 | return; | ||
| 1194 | |||
| 1195 | /* | ||
| 1196 | * First handle the simple case where no cmpxchg is necessary; just | ||
| 1197 | * allow posting non-urgent interrupts. | ||
| 1198 | * | ||
| 1199 | * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change | ||
| 1200 | * PI.NDST: pi_post_block will do it for us and the wakeup_handler | ||
| 1201 | * expects the VCPU to be on the blocked_vcpu_list that matches | ||
| 1202 | * PI.NDST. | ||
| 1203 | */ | ||
| 1204 | if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || | ||
| 1205 | vcpu->cpu == cpu) { | ||
| 1206 | pi_clear_sn(pi_desc); | ||
| 1207 | return; | ||
| 1208 | } | ||
| 1209 | |||
| 1210 | /* The full case. */ | ||
| 1211 | do { | ||
| 1212 | old.control = new.control = pi_desc->control; | ||
| 1213 | |||
| 1214 | dest = cpu_physical_id(cpu); | ||
| 1215 | |||
| 1216 | if (x2apic_enabled()) | ||
| 1217 | new.ndst = dest; | ||
| 1218 | else | ||
| 1219 | new.ndst = (dest << 8) & 0xFF00; | ||
| 1220 | |||
| 1221 | new.sn = 0; | ||
| 1222 | } while (cmpxchg64(&pi_desc->control, old.control, | ||
| 1223 | new.control) != old.control); | ||
| 1224 | } | ||
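
The NDST update above encodes the destination differently depending on APIC mode: x2APIC uses the full physical APIC ID, while xAPIC places the 8-bit ID in bits 15:8. A small sketch of just that encoding; the function name is illustrative.

	/* Standalone sketch of the PI.NDST destination encoding used above. */
	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	static uint32_t pi_ndst(uint32_t apic_id, bool x2apic)
	{
		/* x2APIC: full 32-bit ID; xAPIC: 8-bit ID in bits 15:8. */
		return x2apic ? apic_id : (apic_id << 8) & 0xff00;
	}

	int main(void)
	{
		printf("xAPIC id 5  -> NDST %#x\n", pi_ndst(5, false));	/* 0x500 */
		printf("x2APIC id 5 -> NDST %#x\n", pi_ndst(5, true));	/* 0x5   */
		return 0;
	}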
| 1225 | |||
| 1226 | /* | ||
| 1227 | * Switches to specified vcpu, until a matching vcpu_put(), but assumes | ||
| 1228 | * vcpu mutex is already taken. | ||
| 1229 | */ | ||
| 1230 | void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | ||
| 1231 | { | ||
| 1232 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 1233 | bool already_loaded = vmx->loaded_vmcs->cpu == cpu; | ||
| 1234 | |||
| 1235 | if (!already_loaded) { | ||
| 1236 | loaded_vmcs_clear(vmx->loaded_vmcs); | ||
| 1237 | local_irq_disable(); | ||
| 1238 | crash_disable_local_vmclear(cpu); | ||
| 1239 | |||
| 1240 | /* | ||
| 1241 | * The read of loaded_vmcs->cpu must happen before fetching | ||
| 1242 | * loaded_vmcs->loaded_vmcss_on_cpu_link. | ||
| 1243 | * See the comments in __loaded_vmcs_clear(). | ||
| 1244 | */ | ||
| 1245 | smp_rmb(); | ||
| 1246 | |||
| 1247 | list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, | ||
| 1248 | &per_cpu(loaded_vmcss_on_cpu, cpu)); | ||
| 1249 | crash_enable_local_vmclear(cpu); | ||
| 1250 | local_irq_enable(); | ||
| 1251 | } | ||
| 1252 | |||
| 1253 | if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { | ||
| 1254 | per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; | ||
| 1255 | vmcs_load(vmx->loaded_vmcs->vmcs); | ||
| 1256 | indirect_branch_prediction_barrier(); | ||
| 1257 | } | ||
| 1258 | |||
| 1259 | if (!already_loaded) { | ||
| 1260 | void *gdt = get_current_gdt_ro(); | ||
| 1261 | unsigned long sysenter_esp; | ||
| 1262 | |||
| 1263 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); | ||
| 1264 | |||
| 1265 | /* | ||
| 1266 | * Linux uses per-cpu TSS and GDT, so set these when switching | ||
| 1267 | * processors. See 22.2.4. | ||
| 1268 | */ | ||
| 1269 | vmcs_writel(HOST_TR_BASE, | ||
| 1270 | (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); | ||
| 1271 | vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ | ||
| 1272 | |||
| 1273 | /* | ||
| 1274 | * VM exits change the host TR limit to 0x67 after a VM | ||
| 1275 | * exit. This is okay, since 0x67 covers everything except | ||
| 1276 | * the IO bitmap and we have code to handle the IO bitmap | ||
| 1277 | * being lost after a VM exit. | ||
| 1278 | */ | ||
| 1279 | BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67); | ||
| 1280 | |||
| 1281 | rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); | ||
| 1282 | vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ | ||
| 1283 | |||
| 1284 | vmx->loaded_vmcs->cpu = cpu; | ||
| 1285 | } | ||
| 1286 | |||
| 1287 | /* Setup TSC multiplier */ | ||
| 1288 | if (kvm_has_tsc_control && | ||
| 1289 | vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) | ||
| 1290 | decache_tsc_multiplier(vmx); | ||
| 1291 | |||
| 1292 | vmx_vcpu_pi_load(vcpu, cpu); | ||
| 1293 | vmx->host_pkru = read_pkru(); | ||
| 1294 | vmx->host_debugctlmsr = get_debugctlmsr(); | ||
| 1295 | } | ||
| 1296 | |||
| 1297 | static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) | ||
| 1298 | { | ||
| 1299 | struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); | ||
| 1300 | |||
| 1301 | if (!kvm_arch_has_assigned_device(vcpu->kvm) || | ||
| 1302 | !irq_remapping_cap(IRQ_POSTING_CAP) || | ||
| 1303 | !kvm_vcpu_apicv_active(vcpu)) | ||
| 1304 | return; | ||
| 1305 | |||
| 1306 | /* Set SN when the vCPU is preempted */ | ||
| 1307 | if (vcpu->preempted) | ||
| 1308 | pi_set_sn(pi_desc); | ||
| 1309 | } | ||
| 1310 | |||
| 1311 | void vmx_vcpu_put(struct kvm_vcpu *vcpu) | ||
| 1312 | { | ||
| 1313 | vmx_vcpu_pi_put(vcpu); | ||
| 1314 | |||
| 1315 | vmx_prepare_switch_to_host(to_vmx(vcpu)); | ||
| 1316 | } | ||
| 1317 | |||
| 1318 | static bool emulation_required(struct kvm_vcpu *vcpu) | ||
| 1319 | { | ||
| 1320 | return emulate_invalid_guest_state && !guest_state_valid(vcpu); | ||
| 1321 | } | ||
| 1322 | |||
| 1323 | static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); | ||
| 1324 | |||
| 1325 | unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) | ||
| 1326 | { | ||
| 1327 | unsigned long rflags, save_rflags; | ||
| 1328 | |||
| 1329 | if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) { | ||
| 1330 | __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); | ||
| 1331 | rflags = vmcs_readl(GUEST_RFLAGS); | ||
| 1332 | if (to_vmx(vcpu)->rmode.vm86_active) { | ||
| 1333 | rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; | ||
| 1334 | save_rflags = to_vmx(vcpu)->rmode.save_rflags; | ||
| 1335 | rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; | ||
| 1336 | } | ||
| 1337 | to_vmx(vcpu)->rflags = rflags; | ||
| 1338 | } | ||
| 1339 | return to_vmx(vcpu)->rflags; | ||
| 1340 | } | ||
| 1341 | |||
| 1342 | void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) | ||
| 1343 | { | ||
| 1344 | unsigned long old_rflags = vmx_get_rflags(vcpu); | ||
| 1345 | |||
| 1346 | __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); | ||
| 1347 | to_vmx(vcpu)->rflags = rflags; | ||
| 1348 | if (to_vmx(vcpu)->rmode.vm86_active) { | ||
| 1349 | to_vmx(vcpu)->rmode.save_rflags = rflags; | ||
| 1350 | rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; | ||
| 1351 | } | ||
| 1352 | vmcs_writel(GUEST_RFLAGS, rflags); | ||
| 1353 | |||
| 1354 | if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM) | ||
| 1355 | to_vmx(vcpu)->emulation_required = emulation_required(vcpu); | ||
| 1356 | } | ||
| 1357 | |||
| 1358 | u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) | ||
| 1359 | { | ||
| 1360 | u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); | ||
| 1361 | int ret = 0; | ||
| 1362 | |||
| 1363 | if (interruptibility & GUEST_INTR_STATE_STI) | ||
| 1364 | ret |= KVM_X86_SHADOW_INT_STI; | ||
| 1365 | if (interruptibility & GUEST_INTR_STATE_MOV_SS) | ||
| 1366 | ret |= KVM_X86_SHADOW_INT_MOV_SS; | ||
| 1367 | |||
| 1368 | return ret; | ||
| 1369 | } | ||
| 1370 | |||
| 1371 | void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) | ||
| 1372 | { | ||
| 1373 | u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); | ||
| 1374 | u32 interruptibility = interruptibility_old; | ||
| 1375 | |||
| 1376 | interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); | ||
| 1377 | |||
| 1378 | if (mask & KVM_X86_SHADOW_INT_MOV_SS) | ||
| 1379 | interruptibility |= GUEST_INTR_STATE_MOV_SS; | ||
| 1380 | else if (mask & KVM_X86_SHADOW_INT_STI) | ||
| 1381 | interruptibility |= GUEST_INTR_STATE_STI; | ||
| 1382 | |||
| 1383 | if ((interruptibility != interruptibility_old)) | ||
| 1384 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); | ||
| 1385 | } | ||
| 1386 | |||
| 1387 | static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) | ||
| 1388 | { | ||
| 1389 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 1390 | unsigned long value; | ||
| 1391 | |||
| 1392 | /* | ||
| 1393 | * Any MSR write that attempts to change bits marked reserved will | ||
| 1394 | * cause a #GP fault. | ||
| 1395 | */ | ||
| 1396 | if (data & vmx->pt_desc.ctl_bitmask) | ||
| 1397 | return 1; | ||
| 1398 | |||
| 1399 | /* | ||
| 1400 | * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will | ||
| 1401 | * result in a #GP unless the same write also clears TraceEn. | ||
| 1402 | */ | ||
| 1403 | if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) && | ||
| 1404 | ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN)) | ||
| 1405 | return 1; | ||
| 1406 | |||
| 1407 | /* | ||
| 1408 | * A WRMSR to IA32_RTIT_CTL that sets TraceEn but clears both ToPA | ||
| 1409 | * and FabricEn causes a #GP, if | ||
| 1410 | * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0 | ||
| 1411 | */ | ||
| 1412 | if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) && | ||
| 1413 | !(data & RTIT_CTL_FABRIC_EN) && | ||
| 1414 | !intel_pt_validate_cap(vmx->pt_desc.caps, | ||
| 1415 | PT_CAP_single_range_output)) | ||
| 1416 | return 1; | ||
| 1417 | |||
| 1418 | /* | ||
| 1419 | * Check the MTCFreq, CycThresh and PSBFreq encodings: any MSR write | ||
| 1420 | * that uses an encoding marked reserved will cause a #GP fault. | ||
| 1421 | */ | ||
| 1422 | value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods); | ||
| 1423 | if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) && | ||
| 1424 | !test_bit((data & RTIT_CTL_MTC_RANGE) >> | ||
| 1425 | RTIT_CTL_MTC_RANGE_OFFSET, &value)) | ||
| 1426 | return 1; | ||
| 1427 | value = intel_pt_validate_cap(vmx->pt_desc.caps, | ||
| 1428 | PT_CAP_cycle_thresholds); | ||
| 1429 | if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && | ||
| 1430 | !test_bit((data & RTIT_CTL_CYC_THRESH) >> | ||
| 1431 | RTIT_CTL_CYC_THRESH_OFFSET, &value)) | ||
| 1432 | return 1; | ||
| 1433 | value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods); | ||
| 1434 | if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && | ||
| 1435 | !test_bit((data & RTIT_CTL_PSB_FREQ) >> | ||
| 1436 | RTIT_CTL_PSB_FREQ_OFFSET, &value)) | ||
| 1437 | return 1; | ||
| 1438 | |||
| 1439 | /* | ||
| 1440 | * A #GP fault is raised if any ADDRx_CFG field is reserved or its | ||
| 1441 | * encoding is greater than 2. | ||
| 1442 | */ | ||
| 1443 | value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET; | ||
| 1444 | if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2)) | ||
| 1445 | return 1; | ||
| 1446 | value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET; | ||
| 1447 | if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2)) | ||
| 1448 | return 1; | ||
| 1449 | value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET; | ||
| 1450 | if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2)) | ||
| 1451 | return 1; | ||
| 1452 | value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET; | ||
| 1453 | if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2)) | ||
| 1454 | return 1; | ||
| 1455 | |||
| 1456 | return 0; | ||
| 1457 | } | ||
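
Each of the frequency/threshold checks above extracts a small encoding field from RTIT_CTL and requires the matching bit to be set in the allowed-encodings bitmap reported by CPUID (e.g. PT_CAP_mtc_periods). A standalone sketch of that pattern follows; the field offset, mask and names are chosen for illustration only.

	/* Standalone sketch: validate a bit-field encoding against a CPUID-style
	 * "allowed encodings" bitmap, as vmx_rtit_ctl_check() does for MTCFreq,
	 * CycThresh and PSBFreq.
	 */
	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	static bool encoding_allowed(uint64_t ctl, unsigned int shift,
				     uint64_t field_mask, uint32_t allowed_bitmap)
	{
		unsigned int encoding = (ctl & field_mask) >> shift;

		return allowed_bitmap & (1u << encoding);
	}

	int main(void)
	{
		/* Illustrative 4-bit field at bits 17:14, allowed encodings {0,3}. */
		const uint64_t mtc_mask = 0xfULL << 14;
		const uint32_t allowed = (1u << 0) | (1u << 3);

		printf("encoding 3 allowed: %d\n",
		       encoding_allowed(3ULL << 14, 14, mtc_mask, allowed));	/* 1 */
		printf("encoding 2 allowed: %d\n",
		       encoding_allowed(2ULL << 14, 14, mtc_mask, allowed));	/* 0 */
		return 0;
	}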
| 1458 | |||
| 1459 | |||
| 1460 | static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | ||
| 1461 | { | ||
| 1462 | unsigned long rip; | ||
| 1463 | |||
| 1464 | rip = kvm_rip_read(vcpu); | ||
| 1465 | rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | ||
| 1466 | kvm_rip_write(vcpu, rip); | ||
| 1467 | |||
| 1468 | /* skipping an emulated instruction also counts */ | ||
| 1469 | vmx_set_interrupt_shadow(vcpu, 0); | ||
| 1470 | } | ||
| 1471 | |||
| 1472 | static void vmx_clear_hlt(struct kvm_vcpu *vcpu) | ||
| 1473 | { | ||
| 1474 | /* | ||
| 1475 | * Ensure that we clear the HLT state in the VMCS. We don't need to | ||
| 1476 | * explicitly skip the instruction because if the HLT state is set, | ||
| 1477 | * then the instruction is already executing and RIP has already been | ||
| 1478 | * advanced. | ||
| 1479 | */ | ||
| 1480 | if (kvm_hlt_in_guest(vcpu->kvm) && | ||
| 1481 | vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) | ||
| 1482 | vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); | ||
| 1483 | } | ||
| 1484 | |||
| 1485 | static void vmx_queue_exception(struct kvm_vcpu *vcpu) | ||
| 1486 | { | ||
| 1487 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 1488 | unsigned nr = vcpu->arch.exception.nr; | ||
| 1489 | bool has_error_code = vcpu->arch.exception.has_error_code; | ||
| 1490 | u32 error_code = vcpu->arch.exception.error_code; | ||
| 1491 | u32 intr_info = nr | INTR_INFO_VALID_MASK; | ||
| 1492 | |||
| 1493 | kvm_deliver_exception_payload(vcpu); | ||
| 1494 | |||
| 1495 | if (has_error_code) { | ||
| 1496 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); | ||
| 1497 | intr_info |= INTR_INFO_DELIVER_CODE_MASK; | ||
| 1498 | } | ||
| 1499 | |||
| 1500 | if (vmx->rmode.vm86_active) { | ||
| 1501 | int inc_eip = 0; | ||
| 1502 | if (kvm_exception_is_soft(nr)) | ||
| 1503 | inc_eip = vcpu->arch.event_exit_inst_len; | ||
| 1504 | if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE) | ||
| 1505 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); | ||
| 1506 | return; | ||
| 1507 | } | ||
| 1508 | |||
| 1509 | WARN_ON_ONCE(vmx->emulation_required); | ||
| 1510 | |||
| 1511 | if (kvm_exception_is_soft(nr)) { | ||
| 1512 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, | ||
| 1513 | vmx->vcpu.arch.event_exit_inst_len); | ||
| 1514 | intr_info |= INTR_TYPE_SOFT_EXCEPTION; | ||
| 1515 | } else | ||
| 1516 | intr_info |= INTR_TYPE_HARD_EXCEPTION; | ||
| 1517 | |||
| 1518 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); | ||
| 1519 | |||
| 1520 | vmx_clear_hlt(vcpu); | ||
| 1521 | } | ||
| 1522 | |||
| 1523 | static bool vmx_rdtscp_supported(void) | ||
| 1524 | { | ||
| 1525 | return cpu_has_vmx_rdtscp(); | ||
| 1526 | } | ||
| 1527 | |||
| 1528 | static bool vmx_invpcid_supported(void) | ||
| 1529 | { | ||
| 1530 | return cpu_has_vmx_invpcid(); | ||
| 1531 | } | ||
| 1532 | |||
| 1533 | /* | ||
| 1534 | * Swap two entries in the guest MSR save/restore array. | ||
| 1535 | */ | ||
| 1536 | static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) | ||
| 1537 | { | ||
| 1538 | struct shared_msr_entry tmp; | ||
| 1539 | |||
| 1540 | tmp = vmx->guest_msrs[to]; | ||
| 1541 | vmx->guest_msrs[to] = vmx->guest_msrs[from]; | ||
| 1542 | vmx->guest_msrs[from] = tmp; | ||
| 1543 | } | ||
| 1544 | |||
| 1545 | /* | ||
| 1546 | * Set up the vmcs to automatically save and restore system | ||
| 1547 | * msrs. Don't touch the 64-bit msrs if the guest is in legacy | ||
| 1548 | * mode, as fiddling with msrs is very expensive. | ||
| 1549 | */ | ||
| 1550 | static void setup_msrs(struct vcpu_vmx *vmx) | ||
| 1551 | { | ||
| 1552 | int save_nmsrs, index; | ||
| 1553 | |||
| 1554 | save_nmsrs = 0; | ||
| 1555 | #ifdef CONFIG_X86_64 | ||
| 1556 | /* | ||
| 1557 | * The SYSCALL MSRs are only needed on long mode guests, and only | ||
| 1558 | * when EFER.SCE is set. | ||
| 1559 | */ | ||
| 1560 | if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) { | ||
| 1561 | index = __find_msr_index(vmx, MSR_STAR); | ||
| 1562 | if (index >= 0) | ||
| 1563 | move_msr_up(vmx, index, save_nmsrs++); | ||
| 1564 | index = __find_msr_index(vmx, MSR_LSTAR); | ||
| 1565 | if (index >= 0) | ||
| 1566 | move_msr_up(vmx, index, save_nmsrs++); | ||
| 1567 | index = __find_msr_index(vmx, MSR_SYSCALL_MASK); | ||
| 1568 | if (index >= 0) | ||
| 1569 | move_msr_up(vmx, index, save_nmsrs++); | ||
| 1570 | } | ||
| 1571 | #endif | ||
| 1572 | index = __find_msr_index(vmx, MSR_EFER); | ||
| 1573 | if (index >= 0 && update_transition_efer(vmx, index)) | ||
| 1574 | move_msr_up(vmx, index, save_nmsrs++); | ||
| 1575 | index = __find_msr_index(vmx, MSR_TSC_AUX); | ||
| 1576 | if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) | ||
| 1577 | move_msr_up(vmx, index, save_nmsrs++); | ||
| 1578 | |||
| 1579 | vmx->save_nmsrs = save_nmsrs; | ||
| 1580 | vmx->guest_msrs_dirty = true; | ||
| 1581 | |||
| 1582 | if (cpu_has_vmx_msr_bitmap()) | ||
| 1583 | vmx_update_msr_bitmap(&vmx->vcpu); | ||
| 1584 | } | ||
| 1585 | |||
| 1586 | static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu) | ||
| 1587 | { | ||
| 1588 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 1589 | |||
| 1590 | if (is_guest_mode(vcpu) && | ||
| 1591 | (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)) | ||
| 1592 | return vcpu->arch.tsc_offset - vmcs12->tsc_offset; | ||
| 1593 | |||
| 1594 | return vcpu->arch.tsc_offset; | ||
| 1595 | } | ||
| 1596 | |||
| 1597 | static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) | ||
| 1598 | { | ||
| 1599 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 1600 | u64 g_tsc_offset = 0; | ||
| 1601 | |||
| 1602 | /* | ||
| 1603 | * We're here if L1 chose not to trap WRMSR to TSC. According | ||
| 1604 | * to the spec, this should set L1's TSC; the offset that L1 | ||
| 1605 | * set for L2 remains unchanged, and still needs to be added | ||
| 1606 | * to the newly set TSC to get L2's TSC. | ||
| 1607 | */ | ||
| 1608 | if (is_guest_mode(vcpu) && | ||
| 1609 | (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)) | ||
| 1610 | g_tsc_offset = vmcs12->tsc_offset; | ||
| 1611 | |||
| 1612 | trace_kvm_write_tsc_offset(vcpu->vcpu_id, | ||
| 1613 | vcpu->arch.tsc_offset - g_tsc_offset, | ||
| 1614 | offset); | ||
| 1615 | vmcs_write64(TSC_OFFSET, offset + g_tsc_offset); | ||
| 1616 | return offset + g_tsc_offset; | ||
| 1617 | } | ||
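
While L2 runs with CPU_BASED_USE_TSC_OFFSETING, the offset programmed into the hardware VMCS is the sum of L1's own offset and the offset L1 gave L2 in vmcs12, which is why the write above adds g_tsc_offset back. A worked sketch of that composition, with invented numbers:

	/* Standalone sketch: what a guest reads via RDTSC when TSC offsetting
	 * is active, for L1 directly and for a nested L2.
	 */
	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t host_tsc   = 1000000;	/* value the host would read      */
		uint64_t l1_offset  = 500;	/* offset KVM programs for L1     */
		uint64_t l12_offset = 40;	/* offset L1 put in vmcs12 for L2 */

		printf("L1 sees %llu\n",
		       (unsigned long long)(host_tsc + l1_offset));
		printf("L2 sees %llu (hardware TSC_OFFSET = %llu)\n",
		       (unsigned long long)(host_tsc + l1_offset + l12_offset),
		       (unsigned long long)(l1_offset + l12_offset));
		return 0;
	}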
| 1618 | |||
| 1619 | /* | ||
| 1620 | * nested_vmx_allowed() checks whether a guest should be allowed to use VMX | ||
| 1621 | * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for | ||
| 1622 | * all guests if the "nested" module option is off, and can also be disabled | ||
| 1623 | * for a single guest by disabling its VMX cpuid bit. | ||
| 1624 | */ | ||
| 1625 | bool nested_vmx_allowed(struct kvm_vcpu *vcpu) | ||
| 1626 | { | ||
| 1627 | return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX); | ||
| 1628 | } | ||
| 1629 | |||
| 1630 | static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, | ||
| 1631 | uint64_t val) | ||
| 1632 | { | ||
| 1633 | uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits; | ||
| 1634 | |||
| 1635 | return !(val & ~valid_bits); | ||
| 1636 | } | ||
| 1637 | |||
| 1638 | static int vmx_get_msr_feature(struct kvm_msr_entry *msr) | ||
| 1639 | { | ||
| 1640 | switch (msr->index) { | ||
| 1641 | case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: | ||
| 1642 | if (!nested) | ||
| 1643 | return 1; | ||
| 1644 | return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); | ||
| 1645 | default: | ||
| 1646 | return 1; | ||
| 1647 | } | ||
| 1648 | |||
| 1649 | return 0; | ||
| 1650 | } | ||
| 1651 | |||
| 1652 | /* | ||
| 1653 | * Reads the MSR identified by msr_info->index into msr_info->data. | ||
| 1654 | * Returns 0 on success, non-0 otherwise. | ||
| 1655 | * Assumes vcpu_load() was already called. | ||
| 1656 | */ | ||
| 1657 | static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) | ||
| 1658 | { | ||
| 1659 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 1660 | struct shared_msr_entry *msr; | ||
| 1661 | u32 index; | ||
| 1662 | |||
| 1663 | switch (msr_info->index) { | ||
| 1664 | #ifdef CONFIG_X86_64 | ||
| 1665 | case MSR_FS_BASE: | ||
| 1666 | msr_info->data = vmcs_readl(GUEST_FS_BASE); | ||
| 1667 | break; | ||
| 1668 | case MSR_GS_BASE: | ||
| 1669 | msr_info->data = vmcs_readl(GUEST_GS_BASE); | ||
| 1670 | break; | ||
| 1671 | case MSR_KERNEL_GS_BASE: | ||
| 1672 | msr_info->data = vmx_read_guest_kernel_gs_base(vmx); | ||
| 1673 | break; | ||
| 1674 | #endif | ||
| 1675 | case MSR_EFER: | ||
| 1676 | return kvm_get_msr_common(vcpu, msr_info); | ||
| 1677 | case MSR_IA32_SPEC_CTRL: | ||
| 1678 | if (!msr_info->host_initiated && | ||
| 1679 | !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) | ||
| 1680 | return 1; | ||
| 1681 | |||
| 1682 | msr_info->data = to_vmx(vcpu)->spec_ctrl; | ||
| 1683 | break; | ||
| 1684 | case MSR_IA32_ARCH_CAPABILITIES: | ||
| 1685 | if (!msr_info->host_initiated && | ||
| 1686 | !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) | ||
| 1687 | return 1; | ||
| 1688 | msr_info->data = to_vmx(vcpu)->arch_capabilities; | ||
| 1689 | break; | ||
| 1690 | case MSR_IA32_SYSENTER_CS: | ||
| 1691 | msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); | ||
| 1692 | break; | ||
| 1693 | case MSR_IA32_SYSENTER_EIP: | ||
| 1694 | msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); | ||
| 1695 | break; | ||
| 1696 | case MSR_IA32_SYSENTER_ESP: | ||
| 1697 | msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); | ||
| 1698 | break; | ||
| 1699 | case MSR_IA32_BNDCFGS: | ||
| 1700 | if (!kvm_mpx_supported() || | ||
| 1701 | (!msr_info->host_initiated && | ||
| 1702 | !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) | ||
| 1703 | return 1; | ||
| 1704 | msr_info->data = vmcs_read64(GUEST_BNDCFGS); | ||
| 1705 | break; | ||
| 1706 | case MSR_IA32_MCG_EXT_CTL: | ||
| 1707 | if (!msr_info->host_initiated && | ||
| 1708 | !(vmx->msr_ia32_feature_control & | ||
| 1709 | FEATURE_CONTROL_LMCE)) | ||
| 1710 | return 1; | ||
| 1711 | msr_info->data = vcpu->arch.mcg_ext_ctl; | ||
| 1712 | break; | ||
| 1713 | case MSR_IA32_FEATURE_CONTROL: | ||
| 1714 | msr_info->data = vmx->msr_ia32_feature_control; | ||
| 1715 | break; | ||
| 1716 | case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: | ||
| 1717 | if (!nested_vmx_allowed(vcpu)) | ||
| 1718 | return 1; | ||
| 1719 | return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, | ||
| 1720 | &msr_info->data); | ||
| 1721 | case MSR_IA32_XSS: | ||
| 1722 | if (!vmx_xsaves_supported()) | ||
| 1723 | return 1; | ||
| 1724 | msr_info->data = vcpu->arch.ia32_xss; | ||
| 1725 | break; | ||
| 1726 | case MSR_IA32_RTIT_CTL: | ||
| 1727 | if (pt_mode != PT_MODE_HOST_GUEST) | ||
| 1728 | return 1; | ||
| 1729 | msr_info->data = vmx->pt_desc.guest.ctl; | ||
| 1730 | break; | ||
| 1731 | case MSR_IA32_RTIT_STATUS: | ||
| 1732 | if (pt_mode != PT_MODE_HOST_GUEST) | ||
| 1733 | return 1; | ||
| 1734 | msr_info->data = vmx->pt_desc.guest.status; | ||
| 1735 | break; | ||
| 1736 | case MSR_IA32_RTIT_CR3_MATCH: | ||
| 1737 | if ((pt_mode != PT_MODE_HOST_GUEST) || | ||
| 1738 | !intel_pt_validate_cap(vmx->pt_desc.caps, | ||
| 1739 | PT_CAP_cr3_filtering)) | ||
| 1740 | return 1; | ||
| 1741 | msr_info->data = vmx->pt_desc.guest.cr3_match; | ||
| 1742 | break; | ||
| 1743 | case MSR_IA32_RTIT_OUTPUT_BASE: | ||
| 1744 | if ((pt_mode != PT_MODE_HOST_GUEST) || | ||
| 1745 | (!intel_pt_validate_cap(vmx->pt_desc.caps, | ||
| 1746 | PT_CAP_topa_output) && | ||
| 1747 | !intel_pt_validate_cap(vmx->pt_desc.caps, | ||
| 1748 | PT_CAP_single_range_output))) | ||
| 1749 | return 1; | ||
| 1750 | msr_info->data = vmx->pt_desc.guest.output_base; | ||
| 1751 | break; | ||
| 1752 | case MSR_IA32_RTIT_OUTPUT_MASK: | ||
| 1753 | if ((pt_mode != PT_MODE_HOST_GUEST) || | ||
| 1754 | (!intel_pt_validate_cap(vmx->pt_desc.caps, | ||
| 1755 | PT_CAP_topa_output) && | ||
| 1756 | !intel_pt_validate_cap(vmx->pt_desc.caps, | ||
| 1757 | PT_CAP_single_range_output))) | ||
| 1758 | return 1; | ||
| 1759 | msr_info->data = vmx->pt_desc.guest.output_mask; | ||
| 1760 | break; | ||
| 1761 | case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: | ||
| 1762 | index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; | ||
| 1763 | if ((pt_mode != PT_MODE_HOST_GUEST) || | ||
| 1764 | (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, | ||
| 1765 | PT_CAP_num_address_ranges))) | ||
| 1766 | return 1; | ||
| 1767 | if (index % 2) | ||
| 1768 | msr_info->data = vmx->pt_desc.guest.addr_b[index / 2]; | ||
| 1769 | else | ||
| 1770 | msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; | ||
| 1771 | break; | ||
| 1772 | case MSR_TSC_AUX: | ||
| 1773 | if (!msr_info->host_initiated && | ||
| 1774 | !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) | ||
| 1775 | return 1; | ||
| 1776 | /* Otherwise falls through */ | ||
| 1777 | default: | ||
| 1778 | msr = find_msr_entry(vmx, msr_info->index); | ||
| 1779 | if (msr) { | ||
| 1780 | msr_info->data = msr->data; | ||
| 1781 | break; | ||
| 1782 | } | ||
| 1783 | return kvm_get_msr_common(vcpu, msr_info); | ||
| 1784 | } | ||
| 1785 | |||
| 1786 | return 0; | ||
| 1787 | } | ||
| 1788 | |||
| 1789 | /* | ||
| 1790 | * Writes the MSR value into the appropriate "register". | ||
| 1791 | * Returns 0 on success, non-0 otherwise. | ||
| 1792 | * Assumes vcpu_load() was already called. | ||
| 1793 | */ | ||
| 1794 | static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) | ||
| 1795 | { | ||
| 1796 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 1797 | struct shared_msr_entry *msr; | ||
| 1798 | int ret = 0; | ||
| 1799 | u32 msr_index = msr_info->index; | ||
| 1800 | u64 data = msr_info->data; | ||
| 1801 | u32 index; | ||
| 1802 | |||
| 1803 | switch (msr_index) { | ||
| 1804 | case MSR_EFER: | ||
| 1805 | ret = kvm_set_msr_common(vcpu, msr_info); | ||
| 1806 | break; | ||
| 1807 | #ifdef CONFIG_X86_64 | ||
| 1808 | case MSR_FS_BASE: | ||
| 1809 | vmx_segment_cache_clear(vmx); | ||
| 1810 | vmcs_writel(GUEST_FS_BASE, data); | ||
| 1811 | break; | ||
| 1812 | case MSR_GS_BASE: | ||
| 1813 | vmx_segment_cache_clear(vmx); | ||
| 1814 | vmcs_writel(GUEST_GS_BASE, data); | ||
| 1815 | break; | ||
| 1816 | case MSR_KERNEL_GS_BASE: | ||
| 1817 | vmx_write_guest_kernel_gs_base(vmx, data); | ||
| 1818 | break; | ||
| 1819 | #endif | ||
| 1820 | case MSR_IA32_SYSENTER_CS: | ||
| 1821 | vmcs_write32(GUEST_SYSENTER_CS, data); | ||
| 1822 | break; | ||
| 1823 | case MSR_IA32_SYSENTER_EIP: | ||
| 1824 | vmcs_writel(GUEST_SYSENTER_EIP, data); | ||
| 1825 | break; | ||
| 1826 | case MSR_IA32_SYSENTER_ESP: | ||
| 1827 | vmcs_writel(GUEST_SYSENTER_ESP, data); | ||
| 1828 | break; | ||
| 1829 | case MSR_IA32_BNDCFGS: | ||
| 1830 | if (!kvm_mpx_supported() || | ||
| 1831 | (!msr_info->host_initiated && | ||
| 1832 | !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) | ||
| 1833 | return 1; | ||
| 1834 | if (is_noncanonical_address(data & PAGE_MASK, vcpu) || | ||
| 1835 | (data & MSR_IA32_BNDCFGS_RSVD)) | ||
| 1836 | return 1; | ||
| 1837 | vmcs_write64(GUEST_BNDCFGS, data); | ||
| 1838 | break; | ||
| 1839 | case MSR_IA32_SPEC_CTRL: | ||
| 1840 | if (!msr_info->host_initiated && | ||
| 1841 | !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) | ||
| 1842 | return 1; | ||
| 1843 | |||
| 1844 | /* The STIBP bit doesn't fault even if it's not advertised */ | ||
| 1845 | if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD)) | ||
| 1846 | return 1; | ||
| 1847 | |||
| 1848 | vmx->spec_ctrl = data; | ||
| 1849 | |||
| 1850 | if (!data) | ||
| 1851 | break; | ||
| 1852 | |||
| 1853 | /* | ||
| 1854 | * For non-nested: | ||
| 1855 | * When it's written (to non-zero) for the first time, pass | ||
| 1856 | * it through. | ||
| 1857 | * | ||
| 1858 | * For nested: | ||
| 1859 | * The handling of the MSR bitmap for L2 guests is done in | ||
| 1860 | * nested_vmx_merge_msr_bitmap. We should not touch the | ||
| 1861 | * vmcs02.msr_bitmap here since it gets completely overwritten | ||
| 1862 | * in the merging. We update the vmcs01 here for L1 as well | ||
| 1863 | * since it will end up touching the MSR anyway now. | ||
| 1864 | */ | ||
| 1865 | vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, | ||
| 1866 | MSR_IA32_SPEC_CTRL, | ||
| 1867 | MSR_TYPE_RW); | ||
| 1868 | break; | ||
| 1869 | case MSR_IA32_PRED_CMD: | ||
| 1870 | if (!msr_info->host_initiated && | ||
| 1871 | !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) | ||
| 1872 | return 1; | ||
| 1873 | |||
| 1874 | if (data & ~PRED_CMD_IBPB) | ||
| 1875 | return 1; | ||
| 1876 | |||
| 1877 | if (!data) | ||
| 1878 | break; | ||
| 1879 | |||
| 1880 | wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); | ||
| 1881 | |||
| 1882 | /* | ||
| 1883 | * For non-nested: | ||
| 1884 | * When it's written (to non-zero) for the first time, pass | ||
| 1885 | * it through. | ||
| 1886 | * | ||
| 1887 | * For nested: | ||
| 1888 | * The handling of the MSR bitmap for L2 guests is done in | ||
| 1889 | * nested_vmx_merge_msr_bitmap. We should not touch the | ||
| 1890 | * vmcs02.msr_bitmap here since it gets completely overwritten | ||
| 1891 | * in the merging. | ||
| 1892 | */ | ||
| 1893 | vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD, | ||
| 1894 | MSR_TYPE_W); | ||
| 1895 | break; | ||
| 1896 | case MSR_IA32_ARCH_CAPABILITIES: | ||
| 1897 | if (!msr_info->host_initiated) | ||
| 1898 | return 1; | ||
| 1899 | vmx->arch_capabilities = data; | ||
| 1900 | break; | ||
| 1901 | case MSR_IA32_CR_PAT: | ||
| 1902 | if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { | ||
| 1903 | if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) | ||
| 1904 | return 1; | ||
| 1905 | vmcs_write64(GUEST_IA32_PAT, data); | ||
| 1906 | vcpu->arch.pat = data; | ||
| 1907 | break; | ||
| 1908 | } | ||
| 1909 | ret = kvm_set_msr_common(vcpu, msr_info); | ||
| 1910 | break; | ||
| 1911 | case MSR_IA32_TSC_ADJUST: | ||
| 1912 | ret = kvm_set_msr_common(vcpu, msr_info); | ||
| 1913 | break; | ||
| 1914 | case MSR_IA32_MCG_EXT_CTL: | ||
| 1915 | if ((!msr_info->host_initiated && | ||
| 1916 | !(to_vmx(vcpu)->msr_ia32_feature_control & | ||
| 1917 | FEATURE_CONTROL_LMCE)) || | ||
| 1918 | (data & ~MCG_EXT_CTL_LMCE_EN)) | ||
| 1919 | return 1; | ||
| 1920 | vcpu->arch.mcg_ext_ctl = data; | ||
| 1921 | break; | ||
| 1922 | case MSR_IA32_FEATURE_CONTROL: | ||
| 1923 | if (!vmx_feature_control_msr_valid(vcpu, data) || | ||
| 1924 | (to_vmx(vcpu)->msr_ia32_feature_control & | ||
| 1925 | FEATURE_CONTROL_LOCKED && !msr_info->host_initiated)) | ||
| 1926 | return 1; | ||
| 1927 | vmx->msr_ia32_feature_control = data; | ||
| 1928 | if (msr_info->host_initiated && data == 0) | ||
| 1929 | vmx_leave_nested(vcpu); | ||
| 1930 | break; | ||
| 1931 | case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: | ||
| 1932 | if (!msr_info->host_initiated) | ||
| 1933 | return 1; /* they are read-only */ | ||
| 1934 | if (!nested_vmx_allowed(vcpu)) | ||
| 1935 | return 1; | ||
| 1936 | return vmx_set_vmx_msr(vcpu, msr_index, data); | ||
| 1937 | case MSR_IA32_XSS: | ||
| 1938 | if (!vmx_xsaves_supported()) | ||
| 1939 | return 1; | ||
| 1940 | /* | ||
| 1941 | * The only supported bit as of Skylake is bit 8, but | ||
| 1942 | * it is not supported in KVM. | ||
| 1943 | */ | ||
| 1944 | if (data != 0) | ||
| 1945 | return 1; | ||
| 1946 | vcpu->arch.ia32_xss = data; | ||
| 1947 | if (vcpu->arch.ia32_xss != host_xss) | ||
| 1948 | add_atomic_switch_msr(vmx, MSR_IA32_XSS, | ||
| 1949 | vcpu->arch.ia32_xss, host_xss, false); | ||
| 1950 | else | ||
| 1951 | clear_atomic_switch_msr(vmx, MSR_IA32_XSS); | ||
| 1952 | break; | ||
| 1953 | case MSR_IA32_RTIT_CTL: | ||
| 1954 | if ((pt_mode != PT_MODE_HOST_GUEST) || | ||
| 1955 | vmx_rtit_ctl_check(vcpu, data) || | ||
| 1956 | vmx->nested.vmxon) | ||
| 1957 | return 1; | ||
| 1958 | vmcs_write64(GUEST_IA32_RTIT_CTL, data); | ||
| 1959 | vmx->pt_desc.guest.ctl = data; | ||
| 1960 | pt_update_intercept_for_msr(vmx); | ||
| 1961 | break; | ||
| 1962 | case MSR_IA32_RTIT_STATUS: | ||
| 1963 | if ((pt_mode != PT_MODE_HOST_GUEST) || | ||
| 1964 | (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || | ||
| 1965 | (data & MSR_IA32_RTIT_STATUS_MASK)) | ||
| 1966 | return 1; | ||
| 1967 | vmx->pt_desc.guest.status = data; | ||
| 1968 | break; | ||
| 1969 | case MSR_IA32_RTIT_CR3_MATCH: | ||
| 1970 | if ((pt_mode != PT_MODE_HOST_GUEST) || | ||
| 1971 | (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || | ||
| 1972 | !intel_pt_validate_cap(vmx->pt_desc.caps, | ||
| 1973 | PT_CAP_cr3_filtering)) | ||
| 1974 | return 1; | ||
| 1975 | vmx->pt_desc.guest.cr3_match = data; | ||
| 1976 | break; | ||
| 1977 | case MSR_IA32_RTIT_OUTPUT_BASE: | ||
| 1978 | if ((pt_mode != PT_MODE_HOST_GUEST) || | ||
| 1979 | (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || | ||
| 1980 | (!intel_pt_validate_cap(vmx->pt_desc.caps, | ||
| 1981 | PT_CAP_topa_output) && | ||
| 1982 | !intel_pt_validate_cap(vmx->pt_desc.caps, | ||
| 1983 | PT_CAP_single_range_output)) || | ||
| 1984 | (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK)) | ||
| 1985 | return 1; | ||
| 1986 | vmx->pt_desc.guest.output_base = data; | ||
| 1987 | break; | ||
| 1988 | case MSR_IA32_RTIT_OUTPUT_MASK: | ||
| 1989 | if ((pt_mode != PT_MODE_HOST_GUEST) || | ||
| 1990 | (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || | ||
| 1991 | (!intel_pt_validate_cap(vmx->pt_desc.caps, | ||
| 1992 | PT_CAP_topa_output) && | ||
| 1993 | !intel_pt_validate_cap(vmx->pt_desc.caps, | ||
| 1994 | PT_CAP_single_range_output))) | ||
| 1995 | return 1; | ||
| 1996 | vmx->pt_desc.guest.output_mask = data; | ||
| 1997 | break; | ||
| 1998 | case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: | ||
| 1999 | index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; | ||
| 2000 | if ((pt_mode != PT_MODE_HOST_GUEST) || | ||
| 2001 | (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || | ||
| 2002 | (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, | ||
| 2003 | PT_CAP_num_address_ranges))) | ||
| 2004 | return 1; | ||
| 2005 | if (index % 2) | ||
| 2006 | vmx->pt_desc.guest.addr_b[index / 2] = data; | ||
| 2007 | else | ||
| 2008 | vmx->pt_desc.guest.addr_a[index / 2] = data; | ||
| 2009 | break; | ||
| 2010 | case MSR_TSC_AUX: | ||
| 2011 | if (!msr_info->host_initiated && | ||
| 2012 | !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) | ||
| 2013 | return 1; | ||
| 2014 | /* Check reserved bits: the upper 32 bits must be zero */ | ||
| 2015 | if ((data >> 32) != 0) | ||
| 2016 | return 1; | ||
| 2017 | /* Otherwise falls through */ | ||
| 2018 | default: | ||
| 2019 | msr = find_msr_entry(vmx, msr_index); | ||
| 2020 | if (msr) { | ||
| 2021 | u64 old_msr_data = msr->data; | ||
| 2022 | msr->data = data; | ||
| 2023 | if (msr - vmx->guest_msrs < vmx->save_nmsrs) { | ||
| 2024 | preempt_disable(); | ||
| 2025 | ret = kvm_set_shared_msr(msr->index, msr->data, | ||
| 2026 | msr->mask); | ||
| 2027 | preempt_enable(); | ||
| 2028 | if (ret) | ||
| 2029 | msr->data = old_msr_data; | ||
| 2030 | } | ||
| 2031 | break; | ||
| 2032 | } | ||
| 2033 | ret = kvm_set_msr_common(vcpu, msr_info); | ||
| 2034 | } | ||
| 2035 | |||
| 2036 | return ret; | ||
| 2037 | } | ||
| 2038 | |||
| 2039 | static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) | ||
| 2040 | { | ||
| 2041 | __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); | ||
| 2042 | switch (reg) { | ||
| 2043 | case VCPU_REGS_RSP: | ||
| 2044 | vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); | ||
| 2045 | break; | ||
| 2046 | case VCPU_REGS_RIP: | ||
| 2047 | vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); | ||
| 2048 | break; | ||
| 2049 | case VCPU_EXREG_PDPTR: | ||
| 2050 | if (enable_ept) | ||
| 2051 | ept_save_pdptrs(vcpu); | ||
| 2052 | break; | ||
| 2053 | default: | ||
| 2054 | break; | ||
| 2055 | } | ||
| 2056 | } | ||
| 2057 | |||
| 2058 | static __init int cpu_has_kvm_support(void) | ||
| 2059 | { | ||
| 2060 | return cpu_has_vmx(); | ||
| 2061 | } | ||
| 2062 | |||
| 2063 | static __init int vmx_disabled_by_bios(void) | ||
| 2064 | { | ||
| 2065 | u64 msr; | ||
| 2066 | |||
| 2067 | rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); | ||
| 2068 | if (msr & FEATURE_CONTROL_LOCKED) { | ||
| 2069 | /* launched w/ TXT and VMX disabled */ | ||
| 2070 | if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) | ||
| 2071 | && tboot_enabled()) | ||
| 2072 | return 1; | ||
| 2073 | /* launched w/o TXT and VMX only enabled w/ TXT */ | ||
| 2074 | if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) | ||
| 2075 | && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) | ||
| 2076 | && !tboot_enabled()) { | ||
| 2077 | printk(KERN_WARNING "kvm: disable TXT in the BIOS or " | ||
| 2078 | "activate TXT before enabling KVM\n"); | ||
| 2079 | return 1; | ||
| 2080 | } | ||
| 2081 | /* launched w/o TXT and VMX disabled */ | ||
| 2082 | if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) | ||
| 2083 | && !tboot_enabled()) | ||
| 2084 | return 1; | ||
| 2085 | } | ||
| 2086 | |||
| 2087 | return 0; | ||
| 2088 | } | ||
| 2089 | |||
| 2090 | static void kvm_cpu_vmxon(u64 addr) | ||
| 2091 | { | ||
| 2092 | cr4_set_bits(X86_CR4_VMXE); | ||
| 2093 | intel_pt_handle_vmx(1); | ||
| 2094 | |||
| 2095 | asm volatile ("vmxon %0" : : "m"(addr)); | ||
| 2096 | } | ||
| 2097 | |||
| 2098 | static int hardware_enable(void) | ||
| 2099 | { | ||
| 2100 | int cpu = raw_smp_processor_id(); | ||
| 2101 | u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); | ||
| 2102 | u64 old, test_bits; | ||
| 2103 | |||
| 2104 | if (cr4_read_shadow() & X86_CR4_VMXE) | ||
| 2105 | return -EBUSY; | ||
| 2106 | |||
| 2107 | /* | ||
| 2108 | * This can happen if we hot-added a CPU but failed to allocate | ||
| 2109 | * VP assist page for it. | ||
| 2110 | */ | ||
| 2111 | if (static_branch_unlikely(&enable_evmcs) && | ||
| 2112 | !hv_get_vp_assist_page(cpu)) | ||
| 2113 | return -EFAULT; | ||
| 2114 | |||
| 2115 | INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); | ||
| 2116 | INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); | ||
| 2117 | spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); | ||
| 2118 | |||
| 2119 | /* | ||
| 2120 | * Now we can enable the vmclear operation in kdump | ||
| 2121 | * since the loaded_vmcss_on_cpu list on this cpu | ||
| 2122 | * has been initialized. | ||
| 2123 | * | ||
| 2124 | * Even though the cpu is not yet in VMX operation, enabling | ||
| 2125 | * the vmclear operation here is safe because the | ||
| 2126 | * loaded_vmcss_on_cpu list is still empty. | ||
| 2127 | */ | ||
| 2128 | crash_enable_local_vmclear(cpu); | ||
| 2129 | |||
| 2130 | rdmsrl(MSR_IA32_FEATURE_CONTROL, old); | ||
| 2131 | |||
| 2132 | test_bits = FEATURE_CONTROL_LOCKED; | ||
| 2133 | test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; | ||
| 2134 | if (tboot_enabled()) | ||
| 2135 | test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX; | ||
| 2136 | |||
| 2137 | if ((old & test_bits) != test_bits) { | ||
| 2138 | /* enable and lock */ | ||
| 2139 | wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); | ||
| 2140 | } | ||
| 2141 | kvm_cpu_vmxon(phys_addr); | ||
| 2142 | if (enable_ept) | ||
| 2143 | ept_sync_global(); | ||
| 2144 | |||
| 2145 | return 0; | ||
| 2146 | } | ||
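
hardware_enable() and vmx_disabled_by_bios() both key off three IA32_FEATURE_CONTROL bits: bit 0 locks the MSR, bit 1 permits VMXON inside SMX, bit 2 permits VMXON outside SMX. A small sketch of the "enable and lock it ourselves if the firmware left it unlocked" decision; the constants are restated locally for illustration.

	/* Standalone sketch of the IA32_FEATURE_CONTROL check in hardware_enable(). */
	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	#define FC_LOCKED		(1ULL << 0)
	#define FC_VMXON_INSIDE_SMX	(1ULL << 1)
	#define FC_VMXON_OUTSIDE_SMX	(1ULL << 2)

	static uint64_t needed_bits(bool tboot)
	{
		uint64_t bits = FC_LOCKED | FC_VMXON_OUTSIDE_SMX;

		if (tboot)
			bits |= FC_VMXON_INSIDE_SMX;
		return bits;
	}

	int main(void)
	{
		uint64_t msr = 0;			/* firmware left it unlocked */
		uint64_t want = needed_bits(false);

		if ((msr & want) != want)
			msr |= want;			/* enable and lock ourselves */
		printf("FEATURE_CONTROL now %#llx\n", (unsigned long long)msr);	/* 0x5 */
		return 0;
	}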
| 2147 | |||
| 2148 | static void vmclear_local_loaded_vmcss(void) | ||
| 2149 | { | ||
| 2150 | int cpu = raw_smp_processor_id(); | ||
| 2151 | struct loaded_vmcs *v, *n; | ||
| 2152 | |||
| 2153 | list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), | ||
| 2154 | loaded_vmcss_on_cpu_link) | ||
| 2155 | __loaded_vmcs_clear(v); | ||
| 2156 | } | ||
| 2157 | |||
| 2158 | |||
| 2159 | /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() | ||
| 2160 | * tricks. | ||
| 2161 | */ | ||
| 2162 | static void kvm_cpu_vmxoff(void) | ||
| 2163 | { | ||
| 2164 | asm volatile (__ex("vmxoff")); | ||
| 2165 | |||
| 2166 | intel_pt_handle_vmx(0); | ||
| 2167 | cr4_clear_bits(X86_CR4_VMXE); | ||
| 2168 | } | ||
| 2169 | |||
| 2170 | static void hardware_disable(void) | ||
| 2171 | { | ||
| 2172 | vmclear_local_loaded_vmcss(); | ||
| 2173 | kvm_cpu_vmxoff(); | ||
| 2174 | } | ||
| 2175 | |||
| 2176 | static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, | ||
| 2177 | u32 msr, u32 *result) | ||
| 2178 | { | ||
| 2179 | u32 vmx_msr_low, vmx_msr_high; | ||
| 2180 | u32 ctl = ctl_min | ctl_opt; | ||
| 2181 | |||
| 2182 | rdmsr(msr, vmx_msr_low, vmx_msr_high); | ||
| 2183 | |||
| 2184 | ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ | ||
| 2185 | ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ | ||
| 2186 | |||
| 2187 | /* Ensure minimum (required) set of control bits are supported. */ | ||
| 2188 | if (ctl_min & ~ctl) | ||
| 2189 | return -EIO; | ||
| 2190 | |||
| 2191 | *result = ctl; | ||
| 2192 | return 0; | ||
| 2193 | } | ||
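
adjust_vmx_controls() relies on the VMX capability-MSR convention spelled out in its comments: the low 32 bits report which control bits must be 1, the high 32 bits which bits may be 1. A standalone sketch of that adjustment with made-up numbers (the MSR value and names here are invented for illustration):

	/* Standalone sketch of the allowed-0/allowed-1 adjustment performed by
	 * adjust_vmx_controls().
	 */
	#include <stdint.h>
	#include <stdio.h>

	static int adjust_controls(uint32_t min, uint32_t opt, uint64_t cap_msr,
				   uint32_t *result)
	{
		uint32_t low = (uint32_t)cap_msr;	/* bits that must be 1       */
		uint32_t high = cap_msr >> 32;		/* bits that may be 1        */
		uint32_t ctl = (min | opt) & high;	/* drop unsupported opt bits */

		ctl |= low;				/* force required-by-hw bits */
		if (min & ~ctl)				/* a required bit is missing */
			return -1;
		*result = ctl;
		return 0;
	}

	int main(void)
	{
		uint32_t ctl;
		/* Hardware allows bits 0..3 (high word 0xf), forces bit 1 (low word 0x2). */
		uint64_t cap = ((uint64_t)0xf << 32) | 0x2;

		if (!adjust_controls(0x1, 0x10, cap, &ctl))	/* want bit 0, optional bit 4 */
			printf("ctl=%#x\n", ctl);		/* ctl=0x3: bit 4 was dropped */
		return 0;
	}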
| 2194 | |||
| 2195 | static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, | ||
| 2196 | struct vmx_capability *vmx_cap) | ||
| 2197 | { | ||
| 2198 | u32 vmx_msr_low, vmx_msr_high; | ||
| 2199 | u32 min, opt, min2, opt2; | ||
| 2200 | u32 _pin_based_exec_control = 0; | ||
| 2201 | u32 _cpu_based_exec_control = 0; | ||
| 2202 | u32 _cpu_based_2nd_exec_control = 0; | ||
| 2203 | u32 _vmexit_control = 0; | ||
| 2204 | u32 _vmentry_control = 0; | ||
| 2205 | |||
| 2206 | memset(vmcs_conf, 0, sizeof(*vmcs_conf)); | ||
| 2207 | min = CPU_BASED_HLT_EXITING | | ||
| 2208 | #ifdef CONFIG_X86_64 | ||
| 2209 | CPU_BASED_CR8_LOAD_EXITING | | ||
| 2210 | CPU_BASED_CR8_STORE_EXITING | | ||
| 2211 | #endif | ||
| 2212 | CPU_BASED_CR3_LOAD_EXITING | | ||
| 2213 | CPU_BASED_CR3_STORE_EXITING | | ||
| 2214 | CPU_BASED_UNCOND_IO_EXITING | | ||
| 2215 | CPU_BASED_MOV_DR_EXITING | | ||
| 2216 | CPU_BASED_USE_TSC_OFFSETING | | ||
| 2217 | CPU_BASED_MWAIT_EXITING | | ||
| 2218 | CPU_BASED_MONITOR_EXITING | | ||
| 2219 | CPU_BASED_INVLPG_EXITING | | ||
| 2220 | CPU_BASED_RDPMC_EXITING; | ||
| 2221 | |||
| 2222 | opt = CPU_BASED_TPR_SHADOW | | ||
| 2223 | CPU_BASED_USE_MSR_BITMAPS | | ||
| 2224 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; | ||
| 2225 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, | ||
| 2226 | &_cpu_based_exec_control) < 0) | ||
| 2227 | return -EIO; | ||
| 2228 | #ifdef CONFIG_X86_64 | ||
| 2229 | if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) | ||
| 2230 | _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & | ||
| 2231 | ~CPU_BASED_CR8_STORE_EXITING; | ||
| 2232 | #endif | ||
| 2233 | if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { | ||
| 2234 | min2 = 0; | ||
| 2235 | opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | | ||
| 2236 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | | ||
| 2237 | SECONDARY_EXEC_WBINVD_EXITING | | ||
| 2238 | SECONDARY_EXEC_ENABLE_VPID | | ||
| 2239 | SECONDARY_EXEC_ENABLE_EPT | | ||
| 2240 | SECONDARY_EXEC_UNRESTRICTED_GUEST | | ||
| 2241 | SECONDARY_EXEC_PAUSE_LOOP_EXITING | | ||
| 2242 | SECONDARY_EXEC_DESC | | ||
| 2243 | SECONDARY_EXEC_RDTSCP | | ||
| 2244 | SECONDARY_EXEC_ENABLE_INVPCID | | ||
| 2245 | SECONDARY_EXEC_APIC_REGISTER_VIRT | | ||
| 2246 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | | ||
| 2247 | SECONDARY_EXEC_SHADOW_VMCS | | ||
| 2248 | SECONDARY_EXEC_XSAVES | | ||
| 2249 | SECONDARY_EXEC_RDSEED_EXITING | | ||
| 2250 | SECONDARY_EXEC_RDRAND_EXITING | | ||
| 2251 | SECONDARY_EXEC_ENABLE_PML | | ||
| 2252 | SECONDARY_EXEC_TSC_SCALING | | ||
| 2253 | SECONDARY_EXEC_PT_USE_GPA | | ||
| 2254 | SECONDARY_EXEC_PT_CONCEAL_VMX | | ||
| 2255 | SECONDARY_EXEC_ENABLE_VMFUNC | | ||
| 2256 | SECONDARY_EXEC_ENCLS_EXITING; | ||
| 2257 | if (adjust_vmx_controls(min2, opt2, | ||
| 2258 | MSR_IA32_VMX_PROCBASED_CTLS2, | ||
| 2259 | &_cpu_based_2nd_exec_control) < 0) | ||
| 2260 | return -EIO; | ||
| 2261 | } | ||
| 2262 | #ifndef CONFIG_X86_64 | ||
| 2263 | if (!(_cpu_based_2nd_exec_control & | ||
| 2264 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) | ||
| 2265 | _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; | ||
| 2266 | #endif | ||
| 2267 | |||
| 2268 | if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) | ||
| 2269 | _cpu_based_2nd_exec_control &= ~( | ||
| 2270 | SECONDARY_EXEC_APIC_REGISTER_VIRT | | ||
| 2271 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | | ||
| 2272 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); | ||
| 2273 | |||
| 2274 | rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, | ||
| 2275 | &vmx_cap->ept, &vmx_cap->vpid); | ||
| 2276 | |||
| 2277 | if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { | ||
| 2278 | /* CR3 accesses and invlpg don't need to cause VM-exits when EPT | ||
| 2279 | is enabled. */ | ||
| 2280 | _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | | ||
| 2281 | CPU_BASED_CR3_STORE_EXITING | | ||
| 2282 | CPU_BASED_INVLPG_EXITING); | ||
| 2283 | } else if (vmx_cap->ept) { | ||
| 2284 | vmx_cap->ept = 0; | ||
| 2285 | pr_warn_once("EPT capabilities should not be reported when the " | ||
| 2286 | "'enable EPT' VM-execution control cannot be set\n"); | ||
| 2287 | } | ||
| 2288 | if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && | ||
| 2289 | vmx_cap->vpid) { | ||
| 2290 | vmx_cap->vpid = 0; | ||
| 2291 | pr_warn_once("VPID capability should not be reported when the " | ||
| 2292 | "'enable VPID' VM-execution control cannot be set\n"); | ||
| 2293 | } | ||
| 2294 | |||
| 2295 | min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; | ||
| 2296 | #ifdef CONFIG_X86_64 | ||
| 2297 | min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; | ||
| 2298 | #endif | ||
| 2299 | opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | | ||
| 2300 | VM_EXIT_SAVE_IA32_PAT | | ||
| 2301 | VM_EXIT_LOAD_IA32_PAT | | ||
| 2302 | VM_EXIT_LOAD_IA32_EFER | | ||
| 2303 | VM_EXIT_CLEAR_BNDCFGS | | ||
| 2304 | VM_EXIT_PT_CONCEAL_PIP | | ||
| 2305 | VM_EXIT_CLEAR_IA32_RTIT_CTL; | ||
| 2306 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, | ||
| 2307 | &_vmexit_control) < 0) | ||
| 2308 | return -EIO; | ||
| 2309 | |||
| 2310 | min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; | ||
| 2311 | opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR | | ||
| 2312 | PIN_BASED_VMX_PREEMPTION_TIMER; | ||
| 2313 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, | ||
| 2314 | &_pin_based_exec_control) < 0) | ||
| 2315 | return -EIO; | ||
| 2316 | |||
| 2317 | if (cpu_has_broken_vmx_preemption_timer()) | ||
| 2318 | _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; | ||
| 2319 | if (!(_cpu_based_2nd_exec_control & | ||
| 2320 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) | ||
| 2321 | _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; | ||
| 2322 | |||
| 2323 | min = VM_ENTRY_LOAD_DEBUG_CONTROLS; | ||
| 2324 | opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | | ||
| 2325 | VM_ENTRY_LOAD_IA32_PAT | | ||
| 2326 | VM_ENTRY_LOAD_IA32_EFER | | ||
| 2327 | VM_ENTRY_LOAD_BNDCFGS | | ||
| 2328 | VM_ENTRY_PT_CONCEAL_PIP | | ||
| 2329 | VM_ENTRY_LOAD_IA32_RTIT_CTL; | ||
| 2330 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, | ||
| 2331 | &_vmentry_control) < 0) | ||
| 2332 | return -EIO; | ||
| 2333 | |||
| 2334 | /* | ||
| 2335 | * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they | ||
| 2336 | * can't be used due to an erratum where VM-exit may incorrectly clear | ||
| 2337 | * IA32_PERF_GLOBAL_CTRL[34:32]. Work around the erratum by using the | ||
| 2338 | * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. | ||
| 2339 | */ | ||
| 2340 | if (boot_cpu_data.x86 == 0x6) { | ||
| 2341 | switch (boot_cpu_data.x86_model) { | ||
| 2342 | case 26: /* AAK155 */ | ||
| 2343 | case 30: /* AAP115 */ | ||
| 2344 | case 37: /* AAT100 */ | ||
| 2345 | case 44: /* BC86,AAY89,BD102 */ | ||
| 2346 | case 46: /* BA97 */ | ||
| 2347 | _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; | ||
| 2348 | _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; | ||
| 2349 | pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " | ||
| 2350 | "does not work properly. Using workaround\n"); | ||
| 2351 | break; | ||
| 2352 | default: | ||
| 2353 | break; | ||
| 2354 | } | ||
| 2355 | } | ||
| 2356 | |||
| 2357 | |||
| 2358 | rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); | ||
| 2359 | |||
| 2360 | /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ | ||
| 2361 | if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) | ||
| 2362 | return -EIO; | ||
| 2363 | |||
| 2364 | #ifdef CONFIG_X86_64 | ||
| 2365 | /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ | ||
| 2366 | if (vmx_msr_high & (1u<<16)) | ||
| 2367 | return -EIO; | ||
| 2368 | #endif | ||
| 2369 | |||
| 2370 | /* Require Write-Back (WB) memory type for VMCS accesses. */ | ||
| 2371 | if (((vmx_msr_high >> 18) & 15) != 6) | ||
| 2372 | return -EIO; | ||
| 2373 | |||
| 2374 | vmcs_conf->size = vmx_msr_high & 0x1fff; | ||
| 2375 | vmcs_conf->order = get_order(vmcs_conf->size); | ||
| 2376 | vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; | ||
| 2377 | |||
| 2378 | vmcs_conf->revision_id = vmx_msr_low; | ||
| 2379 | |||
| 2380 | vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; | ||
| 2381 | vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; | ||
| 2382 | vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; | ||
| 2383 | vmcs_conf->vmexit_ctrl = _vmexit_control; | ||
| 2384 | vmcs_conf->vmentry_ctrl = _vmentry_control; | ||
| 2385 | |||
| 2386 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 2387 | evmcs_sanitize_exec_ctrls(vmcs_conf); | ||
| 2388 | |||
| 2389 | return 0; | ||
| 2390 | } | ||
| 2391 | |||
| 2392 | struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu) | ||
| 2393 | { | ||
| 2394 | int node = cpu_to_node(cpu); | ||
| 2395 | struct page *pages; | ||
| 2396 | struct vmcs *vmcs; | ||
| 2397 | |||
| 2398 | pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); | ||
| 2399 | if (!pages) | ||
| 2400 | return NULL; | ||
| 2401 | vmcs = page_address(pages); | ||
| 2402 | memset(vmcs, 0, vmcs_config.size); | ||
| 2403 | |||
| 2404 | /* KVM supports Enlightened VMCS v1 only */ | ||
| 2405 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 2406 | vmcs->hdr.revision_id = KVM_EVMCS_VERSION; | ||
| 2407 | else | ||
| 2408 | vmcs->hdr.revision_id = vmcs_config.revision_id; | ||
| 2409 | |||
| 2410 | if (shadow) | ||
| 2411 | vmcs->hdr.shadow_vmcs = 1; | ||
| 2412 | return vmcs; | ||
| 2413 | } | ||
| 2414 | |||
| 2415 | void free_vmcs(struct vmcs *vmcs) | ||
| 2416 | { | ||
| 2417 | free_pages((unsigned long)vmcs, vmcs_config.order); | ||
| 2418 | } | ||
| 2419 | |||
| 2420 | /* | ||
| 2421 | * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded | ||
| 2422 | */ | ||
| 2423 | void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) | ||
| 2424 | { | ||
| 2425 | if (!loaded_vmcs->vmcs) | ||
| 2426 | return; | ||
| 2427 | loaded_vmcs_clear(loaded_vmcs); | ||
| 2428 | free_vmcs(loaded_vmcs->vmcs); | ||
| 2429 | loaded_vmcs->vmcs = NULL; | ||
| 2430 | if (loaded_vmcs->msr_bitmap) | ||
| 2431 | free_page((unsigned long)loaded_vmcs->msr_bitmap); | ||
| 2432 | WARN_ON(loaded_vmcs->shadow_vmcs != NULL); | ||
| 2433 | } | ||
| 2434 | |||
| 2435 | int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) | ||
| 2436 | { | ||
| 2437 | loaded_vmcs->vmcs = alloc_vmcs(false); | ||
| 2438 | if (!loaded_vmcs->vmcs) | ||
| 2439 | return -ENOMEM; | ||
| 2440 | |||
| 2441 | loaded_vmcs->shadow_vmcs = NULL; | ||
| 2442 | loaded_vmcs_init(loaded_vmcs); | ||
| 2443 | |||
| 2444 | if (cpu_has_vmx_msr_bitmap()) { | ||
| 2445 | loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); | ||
| 2446 | if (!loaded_vmcs->msr_bitmap) | ||
| 2447 | goto out_vmcs; | ||
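| | /* | ||
| | * A set bit in the MSR bitmap means the access is intercepted, so | ||
| | * default to intercepting every MSR and clear individual bits later | ||
| | * via vmx_disable_intercept_for_msr(). | ||
| | */ | ||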
| 2448 | memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); | ||
| 2449 | |||
| 2450 | if (IS_ENABLED(CONFIG_HYPERV) && | ||
| 2451 | static_branch_unlikely(&enable_evmcs) && | ||
| 2452 | (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { | ||
| 2453 | struct hv_enlightened_vmcs *evmcs = | ||
| 2454 | (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs; | ||
| 2455 | |||
| 2456 | evmcs->hv_enlightenments_control.msr_bitmap = 1; | ||
| 2457 | } | ||
| 2458 | } | ||
| 2459 | |||
| 2460 | memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); | ||
| 2461 | |||
| 2462 | return 0; | ||
| 2463 | |||
| 2464 | out_vmcs: | ||
| 2465 | free_loaded_vmcs(loaded_vmcs); | ||
| 2466 | return -ENOMEM; | ||
| 2467 | } | ||
| 2468 | |||
| 2469 | static void free_kvm_area(void) | ||
| 2470 | { | ||
| 2471 | int cpu; | ||
| 2472 | |||
| 2473 | for_each_possible_cpu(cpu) { | ||
| 2474 | free_vmcs(per_cpu(vmxarea, cpu)); | ||
| 2475 | per_cpu(vmxarea, cpu) = NULL; | ||
| 2476 | } | ||
| 2477 | } | ||
| 2478 | |||
| 2479 | static __init int alloc_kvm_area(void) | ||
| 2480 | { | ||
| 2481 | int cpu; | ||
| 2482 | |||
| 2483 | for_each_possible_cpu(cpu) { | ||
| 2484 | struct vmcs *vmcs; | ||
| 2485 | |||
| 2486 | vmcs = alloc_vmcs_cpu(false, cpu); | ||
| 2487 | if (!vmcs) { | ||
| 2488 | free_kvm_area(); | ||
| 2489 | return -ENOMEM; | ||
| 2490 | } | ||
| 2491 | |||
| 2492 | /* | ||
| 2493 | * When eVMCS is enabled, alloc_vmcs_cpu() sets | ||
| 2494 | * vmcs->revision_id to KVM_EVMCS_VERSION instead of | ||
| 2495 | * revision_id reported by MSR_IA32_VMX_BASIC. | ||
| 2496 | * | ||
| 2497 | * However, even though not explicitly documented by | ||
| 2498 | * the TLFS, the VMXON region (vmxarea) should still be | ||
| 2499 | * marked with the revision_id reported by the physical | ||
| 2500 | * CPU. | ||
| 2501 | */ | ||
| 2502 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 2503 | vmcs->hdr.revision_id = vmcs_config.revision_id; | ||
| 2504 | |||
| 2505 | per_cpu(vmxarea, cpu) = vmcs; | ||
| 2506 | } | ||
| 2507 | return 0; | ||
| 2508 | } | ||
| 2509 | |||
| 2510 | static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, | ||
| 2511 | struct kvm_segment *save) | ||
| 2512 | { | ||
| 2513 | if (!emulate_invalid_guest_state) { | ||
| 2514 | /* | ||
| 2515 | * CS and SS RPL should be equal during guest entry according | ||
| 2516 | * to VMX spec, but in reality it is not always so. Since vcpu | ||
| 2517 | * is in the middle of the transition from real mode to | ||
| 2518 | * protected mode it is safe to assume that RPL 0 is a good | ||
| 2519 | * default value. | ||
| 2520 | */ | ||
| 2521 | if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) | ||
| 2522 | save->selector &= ~SEGMENT_RPL_MASK; | ||
| 2523 | save->dpl = save->selector & SEGMENT_RPL_MASK; | ||
| 2524 | save->s = 1; | ||
| 2525 | } | ||
| 2526 | vmx_set_segment(vcpu, save, seg); | ||
| 2527 | } | ||
| 2528 | |||
| 2529 | static void enter_pmode(struct kvm_vcpu *vcpu) | ||
| 2530 | { | ||
| 2531 | unsigned long flags; | ||
| 2532 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 2533 | |||
| 2534 | /* | ||
| 2535 | * Update the real mode segment cache. It may not be up to date if a | ||
| 2536 | * segment register was written while the vcpu was in guest mode. | ||
| 2537 | */ | ||
| 2538 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); | ||
| 2539 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); | ||
| 2540 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); | ||
| 2541 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); | ||
| 2542 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); | ||
| 2543 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); | ||
| 2544 | |||
| 2545 | vmx->rmode.vm86_active = 0; | ||
| 2546 | |||
| 2547 | vmx_segment_cache_clear(vmx); | ||
| 2548 | |||
| 2549 | vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); | ||
| 2550 | |||
| 2551 | flags = vmcs_readl(GUEST_RFLAGS); | ||
| 2552 | flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; | ||
| 2553 | flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; | ||
| 2554 | vmcs_writel(GUEST_RFLAGS, flags); | ||
| 2555 | |||
| 2556 | vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | | ||
| 2557 | (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); | ||
| 2558 | |||
| 2559 | update_exception_bitmap(vcpu); | ||
| 2560 | |||
| 2561 | fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); | ||
| 2562 | fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); | ||
| 2563 | fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); | ||
| 2564 | fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); | ||
| 2565 | fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); | ||
| 2566 | fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); | ||
| 2567 | } | ||
| 2568 | |||
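| | /* | ||
| | * Real mode is emulated with virtual-8086 mode, so segments must look | ||
| | * like real-mode segments: base = selector << 4, limit = 0xffff, and | ||
| | * DPL = 3 because vm86 code runs at CPL 3. | ||
| | */ | ||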
| 2569 | static void fix_rmode_seg(int seg, struct kvm_segment *save) | ||
| 2570 | { | ||
| 2571 | const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | ||
| 2572 | struct kvm_segment var = *save; | ||
| 2573 | |||
| 2574 | var.dpl = 0x3; | ||
| 2575 | if (seg == VCPU_SREG_CS) | ||
| 2576 | var.type = 0x3; | ||
| 2577 | |||
| 2578 | if (!emulate_invalid_guest_state) { | ||
| 2579 | var.selector = var.base >> 4; | ||
| 2580 | var.base = var.base & 0xffff0; | ||
| 2581 | var.limit = 0xffff; | ||
| 2582 | var.g = 0; | ||
| 2583 | var.db = 0; | ||
| 2584 | var.present = 1; | ||
| 2585 | var.s = 1; | ||
| 2586 | var.l = 0; | ||
| 2587 | var.unusable = 0; | ||
| 2588 | var.type = 0x3; | ||
| 2589 | var.avl = 0; | ||
| 2590 | if (save->base & 0xf) | ||
| 2591 | printk_once(KERN_WARNING "kvm: segment base is not " | ||
| 2592 | "paragraph aligned when entering " | ||
| 2593 | "protected mode (seg=%d)", seg); | ||
| 2594 | } | ||
| 2595 | |||
| 2596 | vmcs_write16(sf->selector, var.selector); | ||
| 2597 | vmcs_writel(sf->base, var.base); | ||
| 2598 | vmcs_write32(sf->limit, var.limit); | ||
| 2599 | vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); | ||
| 2600 | } | ||
| 2601 | |||
| 2602 | static void enter_rmode(struct kvm_vcpu *vcpu) | ||
| 2603 | { | ||
| 2604 | unsigned long flags; | ||
| 2605 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 2606 | struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); | ||
| 2607 | |||
| 2608 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); | ||
| 2609 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); | ||
| 2610 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); | ||
| 2611 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); | ||
| 2612 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); | ||
| 2613 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); | ||
| 2614 | vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); | ||
| 2615 | |||
| 2616 | vmx->rmode.vm86_active = 1; | ||
| 2617 | |||
| 2618 | /* | ||
| 2619 | * Very old userspace does not call KVM_SET_TSS_ADDR before entering | ||
| 2620 | * vcpu. Warn the user that an update is overdue. | ||
| 2621 | */ | ||
| 2622 | if (!kvm_vmx->tss_addr) | ||
| 2623 | printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR needs to be " | ||
| 2624 | "called before entering vcpu\n"); | ||
| 2625 | |||
| 2626 | vmx_segment_cache_clear(vmx); | ||
| 2627 | |||
| 2628 | vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr); | ||
| 2629 | vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); | ||
| 2630 | vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); | ||
| 2631 | |||
| 2632 | flags = vmcs_readl(GUEST_RFLAGS); | ||
| 2633 | vmx->rmode.save_rflags = flags; | ||
| 2634 | |||
| 2635 | flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; | ||
| 2636 | |||
| 2637 | vmcs_writel(GUEST_RFLAGS, flags); | ||
| 2638 | vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); | ||
| 2639 | update_exception_bitmap(vcpu); | ||
| 2640 | |||
| 2641 | fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); | ||
| 2642 | fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); | ||
| 2643 | fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); | ||
| 2644 | fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); | ||
| 2645 | fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); | ||
| 2646 | fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); | ||
| 2647 | |||
| 2648 | kvm_mmu_reset_context(vcpu); | ||
| 2649 | } | ||
| 2650 | |||
| 2651 | void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) | ||
| 2652 | { | ||
| 2653 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 2654 | struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); | ||
| 2655 | |||
| 2656 | if (!msr) | ||
| 2657 | return; | ||
| 2658 | |||
| 2659 | vcpu->arch.efer = efer; | ||
| 2660 | if (efer & EFER_LMA) { | ||
| 2661 | vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); | ||
| 2662 | msr->data = efer; | ||
| 2663 | } else { | ||
| 2664 | vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); | ||
| 2665 | |||
| 2666 | msr->data = efer & ~EFER_LME; | ||
| 2667 | } | ||
| 2668 | setup_msrs(vmx); | ||
| 2669 | } | ||
| 2670 | |||
| 2671 | #ifdef CONFIG_X86_64 | ||
| 2672 | |||
| 2673 | static void enter_lmode(struct kvm_vcpu *vcpu) | ||
| 2674 | { | ||
| 2675 | u32 guest_tr_ar; | ||
| 2676 | |||
| 2677 | vmx_segment_cache_clear(to_vmx(vcpu)); | ||
| 2678 | |||
| 2679 | guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); | ||
| 2680 | if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) { | ||
| 2681 | pr_debug_ratelimited("%s: tss fixup for long mode.\n", | ||
| 2682 | __func__); | ||
| 2683 | vmcs_write32(GUEST_TR_AR_BYTES, | ||
| 2684 | (guest_tr_ar & ~VMX_AR_TYPE_MASK) | ||
| 2685 | | VMX_AR_TYPE_BUSY_64_TSS); | ||
| 2686 | } | ||
| 2687 | vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); | ||
| 2688 | } | ||
| 2689 | |||
| 2690 | static void exit_lmode(struct kvm_vcpu *vcpu) | ||
| 2691 | { | ||
| 2692 | vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); | ||
| 2693 | vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); | ||
| 2694 | } | ||
| 2695 | |||
| 2696 | #endif | ||
| 2697 | |||
| 2698 | static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) | ||
| 2699 | { | ||
| 2700 | int vpid = to_vmx(vcpu)->vpid; | ||
| 2701 | |||
| 2702 | if (!vpid_sync_vcpu_addr(vpid, addr)) | ||
| 2703 | vpid_sync_context(vpid); | ||
| 2704 | |||
| 2705 | /* | ||
| 2706 | * If VPIDs are not supported or enabled, then the above is a no-op. | ||
| 2707 | * But we don't really need a TLB flush in that case anyway, because | ||
| 2708 | * each VM entry/exit includes an implicit flush when VPID is 0. | ||
| 2709 | */ | ||
| 2710 | } | ||
| 2711 | |||
| 2712 | static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) | ||
| 2713 | { | ||
| 2714 | ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; | ||
| 2715 | |||
| 2716 | vcpu->arch.cr0 &= ~cr0_guest_owned_bits; | ||
| 2717 | vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; | ||
| 2718 | } | ||
| 2719 | |||
| 2720 | static void vmx_decache_cr3(struct kvm_vcpu *vcpu) | ||
| 2721 | { | ||
| 2722 | if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu))) | ||
| 2723 | vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); | ||
| 2724 | __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); | ||
| 2725 | } | ||
| 2726 | |||
| 2727 | static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) | ||
| 2728 | { | ||
| 2729 | ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; | ||
| 2730 | |||
| 2731 | vcpu->arch.cr4 &= ~cr4_guest_owned_bits; | ||
| 2732 | vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; | ||
| 2733 | } | ||
| 2734 | |||
| 2735 | static void ept_load_pdptrs(struct kvm_vcpu *vcpu) | ||
| 2736 | { | ||
| 2737 | struct kvm_mmu *mmu = vcpu->arch.walk_mmu; | ||
| 2738 | |||
| 2739 | if (!test_bit(VCPU_EXREG_PDPTR, | ||
| 2740 | (unsigned long *)&vcpu->arch.regs_dirty)) | ||
| 2741 | return; | ||
| 2742 | |||
| 2743 | if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { | ||
| 2744 | vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); | ||
| 2745 | vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); | ||
| 2746 | vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); | ||
| 2747 | vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); | ||
| 2748 | } | ||
| 2749 | } | ||
| 2750 | |||
| 2751 | void ept_save_pdptrs(struct kvm_vcpu *vcpu) | ||
| 2752 | { | ||
| 2753 | struct kvm_mmu *mmu = vcpu->arch.walk_mmu; | ||
| 2754 | |||
| 2755 | if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { | ||
| 2756 | mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); | ||
| 2757 | mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); | ||
| 2758 | mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); | ||
| 2759 | mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); | ||
| 2760 | } | ||
| 2761 | |||
| 2762 | __set_bit(VCPU_EXREG_PDPTR, | ||
| 2763 | (unsigned long *)&vcpu->arch.regs_avail); | ||
| 2764 | __set_bit(VCPU_EXREG_PDPTR, | ||
| 2765 | (unsigned long *)&vcpu->arch.regs_dirty); | ||
| 2766 | } | ||
| 2767 | |||
| 2768 | static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, | ||
| 2769 | unsigned long cr0, | ||
| 2770 | struct kvm_vcpu *vcpu) | ||
| 2771 | { | ||
| 2772 | if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) | ||
| 2773 | vmx_decache_cr3(vcpu); | ||
| 2774 | if (!(cr0 & X86_CR0_PG)) { | ||
| 2775 | /* From paging/starting to nonpaging */ | ||
| 2776 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, | ||
| 2777 | vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) | | ||
| 2778 | (CPU_BASED_CR3_LOAD_EXITING | | ||
| 2779 | CPU_BASED_CR3_STORE_EXITING)); | ||
| 2780 | vcpu->arch.cr0 = cr0; | ||
| 2781 | vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); | ||
| 2782 | } else if (!is_paging(vcpu)) { | ||
| 2783 | /* From nonpaging to paging */ | ||
| 2784 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, | ||
| 2785 | vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & | ||
| 2786 | ~(CPU_BASED_CR3_LOAD_EXITING | | ||
| 2787 | CPU_BASED_CR3_STORE_EXITING)); | ||
| 2788 | vcpu->arch.cr0 = cr0; | ||
| 2789 | vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); | ||
| 2790 | } | ||
| 2791 | |||
| 2792 | if (!(cr0 & X86_CR0_WP)) | ||
| 2793 | *hw_cr0 &= ~X86_CR0_WP; | ||
| 2794 | } | ||
| 2795 | |||
| 2796 | void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | ||
| 2797 | { | ||
| 2798 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 2799 | unsigned long hw_cr0; | ||
| 2800 | |||
| 2801 | hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); | ||
| 2802 | if (enable_unrestricted_guest) | ||
| 2803 | hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; | ||
| 2804 | else { | ||
| 2805 | hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; | ||
| 2806 | |||
| 2807 | if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) | ||
| 2808 | enter_pmode(vcpu); | ||
| 2809 | |||
| 2810 | if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) | ||
| 2811 | enter_rmode(vcpu); | ||
| 2812 | } | ||
| 2813 | |||
| 2814 | #ifdef CONFIG_X86_64 | ||
| 2815 | if (vcpu->arch.efer & EFER_LME) { | ||
| 2816 | if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) | ||
| 2817 | enter_lmode(vcpu); | ||
| 2818 | if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) | ||
| 2819 | exit_lmode(vcpu); | ||
| 2820 | } | ||
| 2821 | #endif | ||
| 2822 | |||
| 2823 | if (enable_ept && !enable_unrestricted_guest) | ||
| 2824 | ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); | ||
| 2825 | |||
| 2826 | vmcs_writel(CR0_READ_SHADOW, cr0); | ||
| 2827 | vmcs_writel(GUEST_CR0, hw_cr0); | ||
| 2828 | vcpu->arch.cr0 = cr0; | ||
| 2829 | |||
| 2830 | /* depends on vcpu->arch.cr0 to be set to a new value */ | ||
| 2831 | vmx->emulation_required = emulation_required(vcpu); | ||
| 2832 | } | ||
| 2833 | |||
| 2834 | static int get_ept_level(struct kvm_vcpu *vcpu) | ||
| 2835 | { | ||
| 2836 | if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48)) | ||
| 2837 | return 5; | ||
| 2838 | return 4; | ||
| 2839 | } | ||
| 2840 | |||
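| | /* | ||
| | * Build the EPT pointer: bits 2:0 hold the memory type (6 = WB), | ||
| | * bits 5:3 hold the page-walk length minus one, bit 6 enables the | ||
| | * accessed/dirty flags, and the upper bits hold the physical address | ||
| | * of the root EPT table. | ||
| | */ | ||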
| 2841 | u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) | ||
| 2842 | { | ||
| 2843 | u64 eptp = VMX_EPTP_MT_WB; | ||
| 2844 | |||
| 2845 | eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; | ||
| 2846 | |||
| 2847 | if (enable_ept_ad_bits && | ||
| 2848 | (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) | ||
| 2849 | eptp |= VMX_EPTP_AD_ENABLE_BIT; | ||
| 2850 | eptp |= (root_hpa & PAGE_MASK); | ||
| 2851 | |||
| 2852 | return eptp; | ||
| 2853 | } | ||
| 2854 | |||
| 2855 | void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | ||
| 2856 | { | ||
| 2857 | struct kvm *kvm = vcpu->kvm; | ||
| 2858 | unsigned long guest_cr3; | ||
| 2859 | u64 eptp; | ||
| 2860 | |||
| 2861 | guest_cr3 = cr3; | ||
| 2862 | if (enable_ept) { | ||
| 2863 | eptp = construct_eptp(vcpu, cr3); | ||
| 2864 | vmcs_write64(EPT_POINTER, eptp); | ||
| 2865 | |||
| 2866 | if (kvm_x86_ops->tlb_remote_flush) { | ||
| 2867 | spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); | ||
| 2868 | to_vmx(vcpu)->ept_pointer = eptp; | ||
| 2869 | to_kvm_vmx(kvm)->ept_pointers_match | ||
| 2870 | = EPT_POINTERS_CHECK; | ||
| 2871 | spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); | ||
| 2872 | } | ||
| 2873 | |||
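| | /* | ||
| | * With EPT but without unrestricted guest support, an unpaged guest | ||
| | * still runs with CR0.PG=1 in hardware, so point GUEST_CR3 at the | ||
| | * identity-mapped page table to make guest-virtual equal | ||
| | * guest-physical until the guest enables paging itself. | ||
| | */ | ||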
| 2874 | if (enable_unrestricted_guest || is_paging(vcpu) || | ||
| 2875 | is_guest_mode(vcpu)) | ||
| 2876 | guest_cr3 = kvm_read_cr3(vcpu); | ||
| 2877 | else | ||
| 2878 | guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; | ||
| 2879 | ept_load_pdptrs(vcpu); | ||
| 2880 | } | ||
| 2881 | |||
| 2882 | vmcs_writel(GUEST_CR3, guest_cr3); | ||
| 2883 | } | ||
| 2884 | |||
| 2885 | int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | ||
| 2886 | { | ||
| 2887 | /* | ||
| 2888 | * Pass through host's Machine Check Enable value to hw_cr4, which | ||
| 2889 | * is in force while we are in guest mode. Do not let guests control | ||
| 2890 | * this bit, even if host CR4.MCE == 0. | ||
| 2891 | */ | ||
| 2892 | unsigned long hw_cr4; | ||
| 2893 | |||
| 2894 | hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); | ||
| 2895 | if (enable_unrestricted_guest) | ||
| 2896 | hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; | ||
| 2897 | else if (to_vmx(vcpu)->rmode.vm86_active) | ||
| 2898 | hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; | ||
| 2899 | else | ||
| 2900 | hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; | ||
| 2901 | |||
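| | /* | ||
| | * UMIP is emulated on hardware without native support by turning on | ||
| | * "descriptor-table exiting", which makes SGDT/SIDT/SLDT/SMSW/STR | ||
| | * cause VM-exits so that KVM can emulate the UMIP #GP(0) behaviour | ||
| | * for CPL > 0. | ||
| | */ | ||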
| 2902 | if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) { | ||
| 2903 | if (cr4 & X86_CR4_UMIP) { | ||
| 2904 | vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, | ||
| 2905 | SECONDARY_EXEC_DESC); | ||
| 2906 | hw_cr4 &= ~X86_CR4_UMIP; | ||
| 2907 | } else if (!is_guest_mode(vcpu) || | ||
| 2908 | !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) | ||
| 2909 | vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, | ||
| 2910 | SECONDARY_EXEC_DESC); | ||
| 2911 | } | ||
| 2912 | |||
| 2913 | if (cr4 & X86_CR4_VMXE) { | ||
| 2914 | /* | ||
| 2915 | * To use VMXON (and later other VMX instructions), a guest | ||
| 2916 | * must first be able to turn on cr4.VMXE (see handle_vmon()). | ||
| 2917 | * This is effectively the check on whether to allow | ||
| 2918 | * nested VMX. We operate under the default treatment of | ||
| 2919 | * SMM, so VMX cannot be enabled while in SMM. | ||
| 2920 | */ | ||
| 2921 | if (!nested_vmx_allowed(vcpu) || is_smm(vcpu)) | ||
| 2922 | return 1; | ||
| 2923 | } | ||
| 2924 | |||
| 2925 | if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) | ||
| 2926 | return 1; | ||
| 2927 | |||
| 2928 | vcpu->arch.cr4 = cr4; | ||
| 2929 | |||
| 2930 | if (!enable_unrestricted_guest) { | ||
| 2931 | if (enable_ept) { | ||
| 2932 | if (!is_paging(vcpu)) { | ||
| 2933 | hw_cr4 &= ~X86_CR4_PAE; | ||
| 2934 | hw_cr4 |= X86_CR4_PSE; | ||
| 2935 | } else if (!(cr4 & X86_CR4_PAE)) { | ||
| 2936 | hw_cr4 &= ~X86_CR4_PAE; | ||
| 2937 | } | ||
| 2938 | } | ||
| 2939 | |||
| 2940 | /* | ||
| 2941 | * SMEP/SMAP/PKU are disabled in hardware when the CPU is in | ||
| 2942 | * non-paging mode. To emulate this behavior, SMEP/SMAP/PKU | ||
| 2943 | * need to be manually disabled when the guest switches to | ||
| 2944 | * non-paging mode. | ||
| 2945 | * | ||
| 2946 | * If !enable_unrestricted_guest, the CPU is always running | ||
| 2947 | * with CR0.PG=1 and CR4 needs to be modified. | ||
| 2948 | * If enable_unrestricted_guest, the CPU automatically | ||
| 2949 | * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. | ||
| 2950 | */ | ||
| 2951 | if (!is_paging(vcpu)) | ||
| 2952 | hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); | ||
| 2953 | } | ||
| 2954 | |||
| 2955 | vmcs_writel(CR4_READ_SHADOW, cr4); | ||
| 2956 | vmcs_writel(GUEST_CR4, hw_cr4); | ||
| 2957 | return 0; | ||
| 2958 | } | ||
| 2959 | |||
| 2960 | void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) | ||
| 2961 | { | ||
| 2962 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 2963 | u32 ar; | ||
| 2964 | |||
| 2965 | if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { | ||
| 2966 | *var = vmx->rmode.segs[seg]; | ||
| 2967 | if (seg == VCPU_SREG_TR | ||
| 2968 | || var->selector == vmx_read_guest_seg_selector(vmx, seg)) | ||
| 2969 | return; | ||
| 2970 | var->base = vmx_read_guest_seg_base(vmx, seg); | ||
| 2971 | var->selector = vmx_read_guest_seg_selector(vmx, seg); | ||
| 2972 | return; | ||
| 2973 | } | ||
| 2974 | var->base = vmx_read_guest_seg_base(vmx, seg); | ||
| 2975 | var->limit = vmx_read_guest_seg_limit(vmx, seg); | ||
| 2976 | var->selector = vmx_read_guest_seg_selector(vmx, seg); | ||
| 2977 | ar = vmx_read_guest_seg_ar(vmx, seg); | ||
| 2978 | var->unusable = (ar >> 16) & 1; | ||
| 2979 | var->type = ar & 15; | ||
| 2980 | var->s = (ar >> 4) & 1; | ||
| 2981 | var->dpl = (ar >> 5) & 3; | ||
| 2982 | /* | ||
| 2983 | * Some userspaces do not preserve the unusable property. Since a | ||
| 2984 | * usable segment has to be present according to the VMX spec, we can | ||
| 2985 | * use the present property to work around the userspace bug by making | ||
| 2986 | * an unusable segment always nonpresent. vmx_segment_access_rights() | ||
| 2987 | * already marks a nonpresent segment as unusable. | ||
| 2988 | */ | ||
| 2989 | var->present = !var->unusable; | ||
| 2990 | var->avl = (ar >> 12) & 1; | ||
| 2991 | var->l = (ar >> 13) & 1; | ||
| 2992 | var->db = (ar >> 14) & 1; | ||
| 2993 | var->g = (ar >> 15) & 1; | ||
| 2994 | } | ||
| 2995 | |||
| 2996 | static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) | ||
| 2997 | { | ||
| 2998 | struct kvm_segment s; | ||
| 2999 | |||
| 3000 | if (to_vmx(vcpu)->rmode.vm86_active) { | ||
| 3001 | vmx_get_segment(vcpu, &s, seg); | ||
| 3002 | return s.base; | ||
| 3003 | } | ||
| 3004 | return vmx_read_guest_seg_base(to_vmx(vcpu), seg); | ||
| 3005 | } | ||
| 3006 | |||
| 3007 | int vmx_get_cpl(struct kvm_vcpu *vcpu) | ||
| 3008 | { | ||
| 3009 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 3010 | |||
| 3011 | if (unlikely(vmx->rmode.vm86_active)) | ||
| 3012 | return 0; | ||
| 3013 | else { | ||
| 3014 | int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); | ||
| 3015 | return VMX_AR_DPL(ar); | ||
| 3016 | } | ||
| 3017 | } | ||
| 3018 | |||
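| | /* | ||
| | * Pack a kvm_segment into the VMX access-rights format: type in bits | ||
| | * 3:0, S in bit 4, DPL in bits 6:5, P in bit 7, AVL/L/D.B/G in bits | ||
| | * 12-15, and the "unusable" flag in bit 16. | ||
| | */ | ||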
| 3019 | static u32 vmx_segment_access_rights(struct kvm_segment *var) | ||
| 3020 | { | ||
| 3021 | u32 ar; | ||
| 3022 | |||
| 3023 | if (var->unusable || !var->present) | ||
| 3024 | ar = 1 << 16; | ||
| 3025 | else { | ||
| 3026 | ar = var->type & 15; | ||
| 3027 | ar |= (var->s & 1) << 4; | ||
| 3028 | ar |= (var->dpl & 3) << 5; | ||
| 3029 | ar |= (var->present & 1) << 7; | ||
| 3030 | ar |= (var->avl & 1) << 12; | ||
| 3031 | ar |= (var->l & 1) << 13; | ||
| 3032 | ar |= (var->db & 1) << 14; | ||
| 3033 | ar |= (var->g & 1) << 15; | ||
| 3034 | } | ||
| 3035 | |||
| 3036 | return ar; | ||
| 3037 | } | ||
| 3038 | |||
| 3039 | void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) | ||
| 3040 | { | ||
| 3041 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 3042 | const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | ||
| 3043 | |||
| 3044 | vmx_segment_cache_clear(vmx); | ||
| 3045 | |||
| 3046 | if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { | ||
| 3047 | vmx->rmode.segs[seg] = *var; | ||
| 3048 | if (seg == VCPU_SREG_TR) | ||
| 3049 | vmcs_write16(sf->selector, var->selector); | ||
| 3050 | else if (var->s) | ||
| 3051 | fix_rmode_seg(seg, &vmx->rmode.segs[seg]); | ||
| 3052 | goto out; | ||
| 3053 | } | ||
| 3054 | |||
| 3055 | vmcs_writel(sf->base, var->base); | ||
| 3056 | vmcs_write32(sf->limit, var->limit); | ||
| 3057 | vmcs_write16(sf->selector, var->selector); | ||
| 3058 | |||
| 3059 | /* | ||
| 3060 | * Fix the "Accessed" bit in the AR field of segment registers for | ||
| 3061 | * older qemu binaries. | ||
| 3062 | * The IA-32 architecture specifies that at processor reset the | ||
| 3063 | * "Accessed" bit in the AR field of segment registers is 1, but qemu | ||
| 3064 | * sets it to 0 in its userland code. This causes an invalid-guest- | ||
| 3065 | * state vmexit when "unrestricted guest" mode is turned on. | ||
| 3066 | * A fix for this setup issue in cpu_reset has been pushed to the qemu | ||
| 3067 | * tree; newer qemu binaries with that fix do not need this kvm hack. | ||
| 3069 | */ | ||
| 3070 | if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) | ||
| 3071 | var->type |= 0x1; /* Accessed */ | ||
| 3072 | |||
| 3073 | vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); | ||
| 3074 | |||
| 3075 | out: | ||
| 3076 | vmx->emulation_required = emulation_required(vcpu); | ||
| 3077 | } | ||
| 3078 | |||
| 3079 | static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) | ||
| 3080 | { | ||
| 3081 | u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); | ||
| 3082 | |||
| 3083 | *db = (ar >> 14) & 1; | ||
| 3084 | *l = (ar >> 13) & 1; | ||
| 3085 | } | ||
| 3086 | |||
| 3087 | static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) | ||
| 3088 | { | ||
| 3089 | dt->size = vmcs_read32(GUEST_IDTR_LIMIT); | ||
| 3090 | dt->address = vmcs_readl(GUEST_IDTR_BASE); | ||
| 3091 | } | ||
| 3092 | |||
| 3093 | static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) | ||
| 3094 | { | ||
| 3095 | vmcs_write32(GUEST_IDTR_LIMIT, dt->size); | ||
| 3096 | vmcs_writel(GUEST_IDTR_BASE, dt->address); | ||
| 3097 | } | ||
| 3098 | |||
| 3099 | static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) | ||
| 3100 | { | ||
| 3101 | dt->size = vmcs_read32(GUEST_GDTR_LIMIT); | ||
| 3102 | dt->address = vmcs_readl(GUEST_GDTR_BASE); | ||
| 3103 | } | ||
| 3104 | |||
| 3105 | static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) | ||
| 3106 | { | ||
| 3107 | vmcs_write32(GUEST_GDTR_LIMIT, dt->size); | ||
| 3108 | vmcs_writel(GUEST_GDTR_BASE, dt->address); | ||
| 3109 | } | ||
| 3110 | |||
| 3111 | static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) | ||
| 3112 | { | ||
| 3113 | struct kvm_segment var; | ||
| 3114 | u32 ar; | ||
| 3115 | |||
| 3116 | vmx_get_segment(vcpu, &var, seg); | ||
| 3117 | var.dpl = 0x3; | ||
| 3118 | if (seg == VCPU_SREG_CS) | ||
| 3119 | var.type = 0x3; | ||
| 3120 | ar = vmx_segment_access_rights(&var); | ||
| 3121 | |||
| 3122 | if (var.base != (var.selector << 4)) | ||
| 3123 | return false; | ||
| 3124 | if (var.limit != 0xffff) | ||
| 3125 | return false; | ||
| 3126 | if (ar != 0xf3) | ||
| 3127 | return false; | ||
| 3128 | |||
| 3129 | return true; | ||
| 3130 | } | ||
| 3131 | |||
| 3132 | static bool code_segment_valid(struct kvm_vcpu *vcpu) | ||
| 3133 | { | ||
| 3134 | struct kvm_segment cs; | ||
| 3135 | unsigned int cs_rpl; | ||
| 3136 | |||
| 3137 | vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); | ||
| 3138 | cs_rpl = cs.selector & SEGMENT_RPL_MASK; | ||
| 3139 | |||
| 3140 | if (cs.unusable) | ||
| 3141 | return false; | ||
| 3142 | if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) | ||
| 3143 | return false; | ||
| 3144 | if (!cs.s) | ||
| 3145 | return false; | ||
| 3146 | if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { | ||
| 3147 | if (cs.dpl > cs_rpl) | ||
| 3148 | return false; | ||
| 3149 | } else { | ||
| 3150 | if (cs.dpl != cs_rpl) | ||
| 3151 | return false; | ||
| 3152 | } | ||
| 3153 | if (!cs.present) | ||
| 3154 | return false; | ||
| 3155 | |||
| 3156 | /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ | ||
| 3157 | return true; | ||
| 3158 | } | ||
| 3159 | |||
| 3160 | static bool stack_segment_valid(struct kvm_vcpu *vcpu) | ||
| 3161 | { | ||
| 3162 | struct kvm_segment ss; | ||
| 3163 | unsigned int ss_rpl; | ||
| 3164 | |||
| 3165 | vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); | ||
| 3166 | ss_rpl = ss.selector & SEGMENT_RPL_MASK; | ||
| 3167 | |||
| 3168 | if (ss.unusable) | ||
| 3169 | return true; | ||
| 3170 | if (ss.type != 3 && ss.type != 7) | ||
| 3171 | return false; | ||
| 3172 | if (!ss.s) | ||
| 3173 | return false; | ||
| 3174 | if (ss.dpl != ss_rpl) /* DPL != RPL */ | ||
| 3175 | return false; | ||
| 3176 | if (!ss.present) | ||
| 3177 | return false; | ||
| 3178 | |||
| 3179 | return true; | ||
| 3180 | } | ||
| 3181 | |||
| 3182 | static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) | ||
| 3183 | { | ||
| 3184 | struct kvm_segment var; | ||
| 3185 | unsigned int rpl; | ||
| 3186 | |||
| 3187 | vmx_get_segment(vcpu, &var, seg); | ||
| 3188 | rpl = var.selector & SEGMENT_RPL_MASK; | ||
| 3189 | |||
| 3190 | if (var.unusable) | ||
| 3191 | return true; | ||
| 3192 | if (!var.s) | ||
| 3193 | return false; | ||
| 3194 | if (!var.present) | ||
| 3195 | return false; | ||
| 3196 | if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { | ||
| 3197 | if (var.dpl < rpl) /* DPL < RPL */ | ||
| 3198 | return false; | ||
| 3199 | } | ||
| 3200 | |||
| 3201 | /* TODO: Add other members to kvm_segment_field to allow checking for other access | ||
| 3202 | * rights flags | ||
| 3203 | */ | ||
| 3204 | return true; | ||
| 3205 | } | ||
| 3206 | |||
| 3207 | static bool tr_valid(struct kvm_vcpu *vcpu) | ||
| 3208 | { | ||
| 3209 | struct kvm_segment tr; | ||
| 3210 | |||
| 3211 | vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); | ||
| 3212 | |||
| 3213 | if (tr.unusable) | ||
| 3214 | return false; | ||
| 3215 | if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ | ||
| 3216 | return false; | ||
| 3217 | if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ | ||
| 3218 | return false; | ||
| 3219 | if (!tr.present) | ||
| 3220 | return false; | ||
| 3221 | |||
| 3222 | return true; | ||
| 3223 | } | ||
| 3224 | |||
| 3225 | static bool ldtr_valid(struct kvm_vcpu *vcpu) | ||
| 3226 | { | ||
| 3227 | struct kvm_segment ldtr; | ||
| 3228 | |||
| 3229 | vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); | ||
| 3230 | |||
| 3231 | if (ldtr.unusable) | ||
| 3232 | return true; | ||
| 3233 | if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ | ||
| 3234 | return false; | ||
| 3235 | if (ldtr.type != 2) | ||
| 3236 | return false; | ||
| 3237 | if (!ldtr.present) | ||
| 3238 | return false; | ||
| 3239 | |||
| 3240 | return true; | ||
| 3241 | } | ||
| 3242 | |||
| 3243 | static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) | ||
| 3244 | { | ||
| 3245 | struct kvm_segment cs, ss; | ||
| 3246 | |||
| 3247 | vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); | ||
| 3248 | vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); | ||
| 3249 | |||
| 3250 | return ((cs.selector & SEGMENT_RPL_MASK) == | ||
| 3251 | (ss.selector & SEGMENT_RPL_MASK)); | ||
| 3252 | } | ||
| 3253 | |||
| 3254 | /* | ||
| 3255 | * Check if guest state is valid. Returns true if valid, false if | ||
| 3256 | * not. | ||
| 3257 | * We assume that registers are always usable. | ||
| 3258 | */ | ||
| 3259 | static bool guest_state_valid(struct kvm_vcpu *vcpu) | ||
| 3260 | { | ||
| 3261 | if (enable_unrestricted_guest) | ||
| 3262 | return true; | ||
| 3263 | |||
| 3264 | /* real mode guest state checks */ | ||
| 3265 | if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { | ||
| 3266 | if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) | ||
| 3267 | return false; | ||
| 3268 | if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) | ||
| 3269 | return false; | ||
| 3270 | if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) | ||
| 3271 | return false; | ||
| 3272 | if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) | ||
| 3273 | return false; | ||
| 3274 | if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) | ||
| 3275 | return false; | ||
| 3276 | if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) | ||
| 3277 | return false; | ||
| 3278 | } else { | ||
| 3279 | /* protected mode guest state checks */ | ||
| 3280 | if (!cs_ss_rpl_check(vcpu)) | ||
| 3281 | return false; | ||
| 3282 | if (!code_segment_valid(vcpu)) | ||
| 3283 | return false; | ||
| 3284 | if (!stack_segment_valid(vcpu)) | ||
| 3285 | return false; | ||
| 3286 | if (!data_segment_valid(vcpu, VCPU_SREG_DS)) | ||
| 3287 | return false; | ||
| 3288 | if (!data_segment_valid(vcpu, VCPU_SREG_ES)) | ||
| 3289 | return false; | ||
| 3290 | if (!data_segment_valid(vcpu, VCPU_SREG_FS)) | ||
| 3291 | return false; | ||
| 3292 | if (!data_segment_valid(vcpu, VCPU_SREG_GS)) | ||
| 3293 | return false; | ||
| 3294 | if (!tr_valid(vcpu)) | ||
| 3295 | return false; | ||
| 3296 | if (!ldtr_valid(vcpu)) | ||
| 3297 | return false; | ||
| 3298 | } | ||
| 3299 | /* TODO: | ||
| 3300 | * - Add checks on RIP | ||
| 3301 | * - Add checks on RFLAGS | ||
| 3302 | */ | ||
| 3303 | |||
| 3304 | return true; | ||
| 3305 | } | ||
| 3306 | |||
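| | /* | ||
| | * Build a minimal real-mode TSS in guest memory: clear three pages, | ||
| | * point the I/O bitmap base past the interrupt-redirection map, and | ||
| | * write the trailing all-ones byte that terminates the I/O bitmap. | ||
| | */ | ||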
| 3307 | static int init_rmode_tss(struct kvm *kvm) | ||
| 3308 | { | ||
| 3309 | gfn_t fn; | ||
| 3310 | u16 data = 0; | ||
| 3311 | int idx, r; | ||
| 3312 | |||
| 3313 | idx = srcu_read_lock(&kvm->srcu); | ||
| 3314 | fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT; | ||
| 3315 | r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); | ||
| 3316 | if (r < 0) | ||
| 3317 | goto out; | ||
| 3318 | data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; | ||
| 3319 | r = kvm_write_guest_page(kvm, fn++, &data, | ||
| 3320 | TSS_IOPB_BASE_OFFSET, sizeof(u16)); | ||
| 3321 | if (r < 0) | ||
| 3322 | goto out; | ||
| 3323 | r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); | ||
| 3324 | if (r < 0) | ||
| 3325 | goto out; | ||
| 3326 | r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); | ||
| 3327 | if (r < 0) | ||
| 3328 | goto out; | ||
| 3329 | data = ~0; | ||
| 3330 | r = kvm_write_guest_page(kvm, fn, &data, | ||
| 3331 | RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, | ||
| 3332 | sizeof(u8)); | ||
| 3333 | out: | ||
| 3334 | srcu_read_unlock(&kvm->srcu, idx); | ||
| 3335 | return r; | ||
| 3336 | } | ||
| 3337 | |||
| 3338 | static int init_rmode_identity_map(struct kvm *kvm) | ||
| 3339 | { | ||
| 3340 | struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); | ||
| 3341 | int i, idx, r = 0; | ||
| 3342 | kvm_pfn_t identity_map_pfn; | ||
| 3343 | u32 tmp; | ||
| 3344 | |||
| 3345 | /* Protect kvm_vmx->ept_identity_pagetable_done. */ | ||
| 3346 | mutex_lock(&kvm->slots_lock); | ||
| 3347 | |||
| 3348 | if (likely(kvm_vmx->ept_identity_pagetable_done)) | ||
| 3349 | goto out2; | ||
| 3350 | |||
| 3351 | if (!kvm_vmx->ept_identity_map_addr) | ||
| 3352 | kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; | ||
| 3353 | identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT; | ||
| 3354 | |||
| 3355 | r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, | ||
| 3356 | kvm_vmx->ept_identity_map_addr, PAGE_SIZE); | ||
| 3357 | if (r < 0) | ||
| 3358 | goto out2; | ||
| 3359 | |||
| 3360 | idx = srcu_read_lock(&kvm->srcu); | ||
| 3361 | r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); | ||
| 3362 | if (r < 0) | ||
| 3363 | goto out; | ||
| 3364 | /* Set up identity-mapping pagetable for EPT in real mode */ | ||
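| | /* 1024 4 MiB PSE entries identity-map the low 4 GiB of guest memory. */ | ||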
| 3365 | for (i = 0; i < PT32_ENT_PER_PAGE; i++) { | ||
| 3366 | tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | | ||
| 3367 | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); | ||
| 3368 | r = kvm_write_guest_page(kvm, identity_map_pfn, | ||
| 3369 | &tmp, i * sizeof(tmp), sizeof(tmp)); | ||
| 3370 | if (r < 0) | ||
| 3371 | goto out; | ||
| 3372 | } | ||
| 3373 | kvm_vmx->ept_identity_pagetable_done = true; | ||
| 3374 | |||
| 3375 | out: | ||
| 3376 | srcu_read_unlock(&kvm->srcu, idx); | ||
| 3377 | |||
| 3378 | out2: | ||
| 3379 | mutex_unlock(&kvm->slots_lock); | ||
| 3380 | return r; | ||
| 3381 | } | ||
| 3382 | |||
| 3383 | static void seg_setup(int seg) | ||
| 3384 | { | ||
| 3385 | const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | ||
| 3386 | unsigned int ar; | ||
| 3387 | |||
| 3388 | vmcs_write16(sf->selector, 0); | ||
| 3389 | vmcs_writel(sf->base, 0); | ||
| 3390 | vmcs_write32(sf->limit, 0xffff); | ||
| 3391 | ar = 0x93; | ||
| 3392 | if (seg == VCPU_SREG_CS) | ||
| 3393 | ar |= 0x08; /* code segment */ | ||
| 3394 | |||
| 3395 | vmcs_write32(sf->ar_bytes, ar); | ||
| 3396 | } | ||
| 3397 | |||
| 3398 | static int alloc_apic_access_page(struct kvm *kvm) | ||
| 3399 | { | ||
| 3400 | struct page *page; | ||
| 3401 | int r = 0; | ||
| 3402 | |||
| 3403 | mutex_lock(&kvm->slots_lock); | ||
| 3404 | if (kvm->arch.apic_access_page_done) | ||
| 3405 | goto out; | ||
| 3406 | r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, | ||
| 3407 | APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); | ||
| 3408 | if (r) | ||
| 3409 | goto out; | ||
| 3410 | |||
| 3411 | page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); | ||
| 3412 | if (is_error_page(page)) { | ||
| 3413 | r = -EFAULT; | ||
| 3414 | goto out; | ||
| 3415 | } | ||
| 3416 | |||
| 3417 | /* | ||
| 3418 | * Do not pin the page in memory, so that it can still be | ||
| 3419 | * migrated, e.g. by memory hot-unplug. | ||
| 3420 | */ | ||
| 3421 | put_page(page); | ||
| 3422 | kvm->arch.apic_access_page_done = true; | ||
| 3423 | out: | ||
| 3424 | mutex_unlock(&kvm->slots_lock); | ||
| 3425 | return r; | ||
| 3426 | } | ||
| 3427 | |||
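| | /* | ||
| | * VPID 0 is reserved for VMX root operation (the host), so returning | ||
| | * 0 here means "no VPID assigned" and the vcpu falls back to the | ||
| | * implicit TLB flush on every VM entry/exit. | ||
| | */ | ||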
| 3428 | int allocate_vpid(void) | ||
| 3429 | { | ||
| 3430 | int vpid; | ||
| 3431 | |||
| 3432 | if (!enable_vpid) | ||
| 3433 | return 0; | ||
| 3434 | spin_lock(&vmx_vpid_lock); | ||
| 3435 | vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); | ||
| 3436 | if (vpid < VMX_NR_VPIDS) | ||
| 3437 | __set_bit(vpid, vmx_vpid_bitmap); | ||
| 3438 | else | ||
| 3439 | vpid = 0; | ||
| 3440 | spin_unlock(&vmx_vpid_lock); | ||
| 3441 | return vpid; | ||
| 3442 | } | ||
| 3443 | |||
| 3444 | void free_vpid(int vpid) | ||
| 3445 | { | ||
| 3446 | if (!enable_vpid || vpid == 0) | ||
| 3447 | return; | ||
| 3448 | spin_lock(&vmx_vpid_lock); | ||
| 3449 | __clear_bit(vpid, vmx_vpid_bitmap); | ||
| 3450 | spin_unlock(&vmx_vpid_lock); | ||
| 3451 | } | ||
| 3452 | |||
| 3453 | static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, | ||
| 3454 | u32 msr, int type) | ||
| 3455 | { | ||
| 3456 | int f = sizeof(unsigned long); | ||
| 3457 | |||
| 3458 | if (!cpu_has_vmx_msr_bitmap()) | ||
| 3459 | return; | ||
| 3460 | |||
| 3461 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 3462 | evmcs_touch_msr_bitmap(); | ||
| 3463 | |||
| 3464 | /* | ||
| 3465 | * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals | ||
| 3466 | * have the write-low and read-high bitmap offsets the wrong way round. | ||
| 3467 | * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. | ||
| 3468 | */ | ||
| 3469 | if (msr <= 0x1fff) { | ||
| 3470 | if (type & MSR_TYPE_R) | ||
| 3471 | /* read-low */ | ||
| 3472 | __clear_bit(msr, msr_bitmap + 0x000 / f); | ||
| 3473 | |||
| 3474 | if (type & MSR_TYPE_W) | ||
| 3475 | /* write-low */ | ||
| 3476 | __clear_bit(msr, msr_bitmap + 0x800 / f); | ||
| 3477 | |||
| 3478 | } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { | ||
| 3479 | msr &= 0x1fff; | ||
| 3480 | if (type & MSR_TYPE_R) | ||
| 3481 | /* read-high */ | ||
| 3482 | __clear_bit(msr, msr_bitmap + 0x400 / f); | ||
| 3483 | |||
| 3484 | if (type & MSR_TYPE_W) | ||
| 3485 | /* write-high */ | ||
| 3486 | __clear_bit(msr, msr_bitmap + 0xc00 / f); | ||
| 3487 | |||
| 3488 | } | ||
| 3489 | } | ||
| 3490 | |||
| 3491 | static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, | ||
| 3492 | u32 msr, int type) | ||
| 3493 | { | ||
| 3494 | int f = sizeof(unsigned long); | ||
| 3495 | |||
| 3496 | if (!cpu_has_vmx_msr_bitmap()) | ||
| 3497 | return; | ||
| 3498 | |||
| 3499 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 3500 | evmcs_touch_msr_bitmap(); | ||
| 3501 | |||
| 3502 | /* | ||
| 3503 | * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals | ||
| 3504 | * have the write-low and read-high bitmap offsets the wrong way round. | ||
| 3505 | * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. | ||
| 3506 | */ | ||
| 3507 | if (msr <= 0x1fff) { | ||
| 3508 | if (type & MSR_TYPE_R) | ||
| 3509 | /* read-low */ | ||
| 3510 | __set_bit(msr, msr_bitmap + 0x000 / f); | ||
| 3511 | |||
| 3512 | if (type & MSR_TYPE_W) | ||
| 3513 | /* write-low */ | ||
| 3514 | __set_bit(msr, msr_bitmap + 0x800 / f); | ||
| 3515 | |||
| 3516 | } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { | ||
| 3517 | msr &= 0x1fff; | ||
| 3518 | if (type & MSR_TYPE_R) | ||
| 3519 | /* read-high */ | ||
| 3520 | __set_bit(msr, msr_bitmap + 0x400 / f); | ||
| 3521 | |||
| 3522 | if (type & MSR_TYPE_W) | ||
| 3523 | /* write-high */ | ||
| 3524 | __set_bit(msr, msr_bitmap + 0xc00 / f); | ||
| 3525 | |||
| 3526 | } | ||
| 3527 | } | ||
| 3528 | |||
| 3529 | static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap, | ||
| 3530 | u32 msr, int type, bool value) | ||
| 3531 | { | ||
| 3532 | if (value) | ||
| 3533 | vmx_enable_intercept_for_msr(msr_bitmap, msr, type); | ||
| 3534 | else | ||
| 3535 | vmx_disable_intercept_for_msr(msr_bitmap, msr, type); | ||
| 3536 | } | ||
| 3537 | |||
| 3538 | static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) | ||
| 3539 | { | ||
| 3540 | u8 mode = 0; | ||
| 3541 | |||
| 3542 | if (cpu_has_secondary_exec_ctrls() && | ||
| 3543 | (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & | ||
| 3544 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { | ||
| 3545 | mode |= MSR_BITMAP_MODE_X2APIC; | ||
| 3546 | if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) | ||
| 3547 | mode |= MSR_BITMAP_MODE_X2APIC_APICV; | ||
| 3548 | } | ||
| 3549 | |||
| 3550 | return mode; | ||
| 3551 | } | ||
| 3552 | |||
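| | /* | ||
| | * x2APIC registers are accessed via MSRs 0x800-0x8ff (MSR 0x800 + | ||
| | * (xAPIC MMIO offset >> 4)), which land in the read-low/write-low | ||
| | * halves of the MSR bitmap. | ||
| | */ | ||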
| 3553 | static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap, | ||
| 3554 | u8 mode) | ||
| 3555 | { | ||
| 3556 | int msr; | ||
| 3557 | |||
| 3558 | for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { | ||
| 3559 | unsigned word = msr / BITS_PER_LONG; | ||
| 3560 | msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0; | ||
| 3561 | msr_bitmap[word + (0x800 / sizeof(long))] = ~0; | ||
| 3562 | } | ||
| 3563 | |||
| 3564 | if (mode & MSR_BITMAP_MODE_X2APIC) { | ||
| 3565 | /* | ||
| 3566 | * TPR reads and writes can be virtualized even if virtual interrupt | ||
| 3567 | * delivery is not in use. | ||
| 3568 | */ | ||
| 3569 | vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW); | ||
| 3570 | if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { | ||
| 3571 | vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R); | ||
| 3572 | vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); | ||
| 3573 | vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); | ||
| 3574 | } | ||
| 3575 | } | ||
| 3576 | } | ||
| 3577 | |||
| 3578 | void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) | ||
| 3579 | { | ||
| 3580 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 3581 | unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; | ||
| 3582 | u8 mode = vmx_msr_bitmap_mode(vcpu); | ||
| 3583 | u8 changed = mode ^ vmx->msr_bitmap_mode; | ||
| 3584 | |||
| 3585 | if (!changed) | ||
| 3586 | return; | ||
| 3587 | |||
| 3588 | if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) | ||
| 3589 | vmx_update_msr_bitmap_x2apic(msr_bitmap, mode); | ||
| 3590 | |||
| 3591 | vmx->msr_bitmap_mode = mode; | ||
| 3592 | } | ||
| 3593 | |||
| 3594 | void pt_update_intercept_for_msr(struct vcpu_vmx *vmx) | ||
| 3595 | { | ||
| 3596 | unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; | ||
| 3597 | bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); | ||
| 3598 | u32 i; | ||
| 3599 | |||
| 3600 | vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS, | ||
| 3601 | MSR_TYPE_RW, flag); | ||
| 3602 | vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE, | ||
| 3603 | MSR_TYPE_RW, flag); | ||
| 3604 | vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK, | ||
| 3605 | MSR_TYPE_RW, flag); | ||
| 3606 | vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH, | ||
| 3607 | MSR_TYPE_RW, flag); | ||
| 3608 | for (i = 0; i < vmx->pt_desc.addr_range; i++) { | ||
| 3609 | vmx_set_intercept_for_msr(msr_bitmap, | ||
| 3610 | MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); | ||
| 3611 | vmx_set_intercept_for_msr(msr_bitmap, | ||
| 3612 | MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); | ||
| 3613 | } | ||
| 3614 | } | ||
| 3615 | |||
| 3616 | static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu) | ||
| 3617 | { | ||
| 3618 | return enable_apicv; | ||
| 3619 | } | ||
| 3620 | |||
| 3621 | static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) | ||
| 3622 | { | ||
| 3623 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 3624 | void *vapic_page; | ||
| 3625 | u32 vppr; | ||
| 3626 | int rvi; | ||
| 3627 | |||
| 3628 | if (WARN_ON_ONCE(!is_guest_mode(vcpu)) || | ||
| 3629 | !nested_cpu_has_vid(get_vmcs12(vcpu)) || | ||
| 3630 | WARN_ON_ONCE(!vmx->nested.virtual_apic_page)) | ||
| 3631 | return false; | ||
| 3632 | |||
| 3633 | rvi = vmx_get_rvi(); | ||
| 3634 | |||
| 3635 | vapic_page = kmap(vmx->nested.virtual_apic_page); | ||
| 3636 | vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); | ||
| 3637 | kunmap(vmx->nested.virtual_apic_page); | ||
| 3638 | |||
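| | /* | ||
| | * An interrupt is deliverable when the priority class (upper nibble) | ||
| | * of the highest requested vector exceeds that of the virtual PPR. | ||
| | */ | ||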
| 3639 | return ((rvi & 0xf0) > (vppr & 0xf0)); | ||
| 3640 | } | ||
| 3641 | |||
| 3642 | static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, | ||
| 3643 | bool nested) | ||
| 3644 | { | ||
| 3645 | #ifdef CONFIG_SMP | ||
| 3646 | int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR; | ||
| 3647 | |||
| 3648 | if (vcpu->mode == IN_GUEST_MODE) { | ||
| 3649 | /* | ||
| 3650 | * The vector of interrupt to be delivered to vcpu had | ||
| 3651 | * been set in PIR before this function. | ||
| 3652 | * | ||
| 3653 | * Following cases will be reached in this block, and | ||
| 3654 | * we always send a notification event in all cases as | ||
| 3655 | * explained below. | ||
| 3656 | * | ||
| 3657 | * Case 1: vcpu keeps in non-root mode. Sending a | ||
| 3658 | * notification event posts the interrupt to vcpu. | ||
| 3659 | * | ||
| 3660 | * Case 2: vcpu exits to root mode and is still | ||
| 3661 | * runnable. PIR will be synced to vIRR before the | ||
| 3662 | * next vcpu entry. Sending a notification event in | ||
| 3663 | * this case has no effect, as the vcpu is no longer | ||
| 3664 | * in non-root mode. | ||
| 3665 | * | ||
| 3666 | * Case 3: vcpu exits to root mode and is blocked. | ||
| 3667 | * vcpu_block() has already synced PIR to vIRR and | ||
| 3668 | * never blocks vcpu if vIRR is not cleared. Therefore, | ||
| 3669 | * a blocked vcpu here does not wait for any requested | ||
| 3670 | * interrupts in PIR, and sending a notification event | ||
| 3671 | * which has no effect is safe here. | ||
| 3672 | */ | ||
| 3673 | |||
| 3674 | apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); | ||
| 3675 | return true; | ||
| 3676 | } | ||
| 3677 | #endif | ||
| 3678 | return false; | ||
| 3679 | } | ||
| 3680 | |||
| 3681 | static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, | ||
| 3682 | int vector) | ||
| 3683 | { | ||
| 3684 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 3685 | |||
| 3686 | if (is_guest_mode(vcpu) && | ||
| 3687 | vector == vmx->nested.posted_intr_nv) { | ||
| 3688 | /* | ||
| 3689 | * If the posted interrupt is not recognized by hardware, | ||
| 3690 | * it will be delivered on the next vmentry. | ||
| 3691 | */ | ||
| 3692 | vmx->nested.pi_pending = true; | ||
| 3693 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
| 3694 | /* the PIR and ON have been set by L1. */ | ||
| 3695 | if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) | ||
| 3696 | kvm_vcpu_kick(vcpu); | ||
| 3697 | return 0; | ||
| 3698 | } | ||
| 3699 | return -1; | ||
| 3700 | } | ||
| 3701 | /* | ||
| 3702 | * Send an interrupt to the vcpu via the posted-interrupt mechanism. | ||
| 3703 | * 1. If the target vcpu is running (non-root mode), send a posted-interrupt | ||
| 3704 | * notification and the hardware will sync the PIR to the vIRR atomically. | ||
| 3705 | * 2. If the target vcpu is not running (root mode), kick it so that it picks | ||
| 3706 | * up the interrupt from the PIR on the next vmentry. | ||
| 3707 | */ | ||
| 3708 | static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) | ||
| 3709 | { | ||
| 3710 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 3711 | int r; | ||
| 3712 | |||
| 3713 | r = vmx_deliver_nested_posted_interrupt(vcpu, vector); | ||
| 3714 | if (!r) | ||
| 3715 | return; | ||
| 3716 | |||
| 3717 | if (pi_test_and_set_pir(vector, &vmx->pi_desc)) | ||
| 3718 | return; | ||
| 3719 | |||
| 3720 | /* If a previous notification has sent the IPI, nothing to do. */ | ||
| 3721 | if (pi_test_and_set_on(&vmx->pi_desc)) | ||
| 3722 | return; | ||
| 3723 | |||
| 3724 | if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) | ||
| 3725 | kvm_vcpu_kick(vcpu); | ||
| 3726 | } | ||
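To make the PIR/ON handshake above concrete, here is a minimal standalone sketch of the posted-interrupt descriptor protocol implemented by pi_test_and_set_pir()/pi_test_and_set_on() and vmx_deliver_posted_interrupt(). It is illustrative only and not part of vmx.c; the names pi_desc_demo and post_interrupt_demo are hypothetical, and C11 atomics stand in for the kernel's test_and_set_bit()-style helpers.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    struct pi_desc_demo {
    	_Atomic uint64_t pir[4];   /* 256-bit posted-interrupt request bitmap */
    	_Atomic uint32_t control;  /* bit 0 = ON: notification outstanding */
    };

    /* Returns true if the caller still needs to send a notification IPI
     * (or kick the vcpu); false if the vector was already pending or a
     * previous sender's notification event covers it. */
    static bool post_interrupt_demo(struct pi_desc_demo *pi, unsigned int vector)
    {
    	uint64_t mask = 1ull << (vector & 63);

    	/* Mark the vector pending in the PIR (like pi_test_and_set_pir). */
    	if (atomic_fetch_or(&pi->pir[(vector >> 6) & 3], mask) & mask)
    		return false;

    	/* Set ON (like pi_test_and_set_on); if it was already set, the
    	 * earlier notification event will also deliver this vector. */
    	return !(atomic_fetch_or(&pi->control, 1u) & 1u);
    }

On real hardware, when the notification IPI arrives while the vcpu is in non-root mode the CPU ORs the PIR into the vIRR and clears ON; in this sketch those receiver-side steps are left out.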
| 3727 | |||
| 3728 | /* | ||
| 3729 | * Set up the vmcs's constant host-state fields, i.e., host-state fields that | ||
| 3730 | * will not change in the lifetime of the guest. | ||
| 3731 | * Note that host-state that does change is set elsewhere. E.g., host-state | ||
| 3732 | * that is set differently for each CPU is set in vmx_vcpu_load(), not here. | ||
| 3733 | */ | ||
| 3734 | void vmx_set_constant_host_state(struct vcpu_vmx *vmx) | ||
| 3735 | { | ||
| 3736 | u32 low32, high32; | ||
| 3737 | unsigned long tmpl; | ||
| 3738 | struct desc_ptr dt; | ||
| 3739 | unsigned long cr0, cr3, cr4; | ||
| 3740 | |||
| 3741 | cr0 = read_cr0(); | ||
| 3742 | WARN_ON(cr0 & X86_CR0_TS); | ||
| 3743 | vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ | ||
| 3744 | |||
| 3745 | /* | ||
| 3746 | * Save the most likely value for this task's CR3 in the VMCS. | ||
| 3747 | * We can't use __get_current_cr3_fast() because we're not atomic. | ||
| 3748 | */ | ||
| 3749 | cr3 = __read_cr3(); | ||
| 3750 | vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ | ||
| 3751 | vmx->loaded_vmcs->host_state.cr3 = cr3; | ||
| 3752 | |||
| 3753 | /* Save the most likely value for this task's CR4 in the VMCS. */ | ||
| 3754 | cr4 = cr4_read_shadow(); | ||
| 3755 | vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ | ||
| 3756 | vmx->loaded_vmcs->host_state.cr4 = cr4; | ||
| 3757 | |||
| 3758 | vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ | ||
| 3759 | #ifdef CONFIG_X86_64 | ||
| 3760 | /* | ||
| 3761 | * Load null selectors, so we can avoid reloading them in | ||
| 3762 | * vmx_prepare_switch_to_host(), in case userspace uses | ||
| 3763 | * the null selectors too (the expected case). | ||
| 3764 | */ | ||
| 3765 | vmcs_write16(HOST_DS_SELECTOR, 0); | ||
| 3766 | vmcs_write16(HOST_ES_SELECTOR, 0); | ||
| 3767 | #else | ||
| 3768 | vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
| 3769 | vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
| 3770 | #endif | ||
| 3771 | vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
| 3772 | vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ | ||
| 3773 | |||
| 3774 | store_idt(&dt); | ||
| 3775 | vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ | ||
| 3776 | vmx->host_idt_base = dt.address; | ||
| 3777 | |||
| 3778 | vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */ | ||
| 3779 | |||
| 3780 | rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); | ||
| 3781 | vmcs_write32(HOST_IA32_SYSENTER_CS, low32); | ||
| 3782 | rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); | ||
| 3783 | vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ | ||
| 3784 | |||
| 3785 | if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { | ||
| 3786 | rdmsr(MSR_IA32_CR_PAT, low32, high32); | ||
| 3787 | vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); | ||
| 3788 | } | ||
| 3789 | |||
| 3790 | if (cpu_has_load_ia32_efer()) | ||
| 3791 | vmcs_write64(HOST_IA32_EFER, host_efer); | ||
| 3792 | } | ||
| 3793 | |||
| 3794 | void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) | ||
| 3795 | { | ||
| 3796 | vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; | ||
| 3797 | if (enable_ept) | ||
| 3798 | vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; | ||
| 3799 | if (is_guest_mode(&vmx->vcpu)) | ||
| 3800 | vmx->vcpu.arch.cr4_guest_owned_bits &= | ||
| 3801 | ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask; | ||
| 3802 | vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); | ||
| 3803 | } | ||
| 3804 | |||
| 3805 | static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) | ||
| 3806 | { | ||
| 3807 | u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; | ||
| 3808 | |||
| 3809 | if (!kvm_vcpu_apicv_active(&vmx->vcpu)) | ||
| 3810 | pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; | ||
| 3811 | |||
| 3812 | if (!enable_vnmi) | ||
| 3813 | pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS; | ||
| 3814 | |||
| 3815 | /* Enable the preemption timer dynamically */ | ||
| 3816 | pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; | ||
| 3817 | return pin_based_exec_ctrl; | ||
| 3818 | } | ||
| 3819 | |||
| 3820 | static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) | ||
| 3821 | { | ||
| 3822 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 3823 | |||
| 3824 | vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); | ||
| 3825 | if (cpu_has_secondary_exec_ctrls()) { | ||
| 3826 | if (kvm_vcpu_apicv_active(vcpu)) | ||
| 3827 | vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, | ||
| 3828 | SECONDARY_EXEC_APIC_REGISTER_VIRT | | ||
| 3829 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); | ||
| 3830 | else | ||
| 3831 | vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, | ||
| 3832 | SECONDARY_EXEC_APIC_REGISTER_VIRT | | ||
| 3833 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); | ||
| 3834 | } | ||
| 3835 | |||
| 3836 | if (cpu_has_vmx_msr_bitmap()) | ||
| 3837 | vmx_update_msr_bitmap(vcpu); | ||
| 3838 | } | ||
| 3839 | |||
| 3840 | u32 vmx_exec_control(struct vcpu_vmx *vmx) | ||
| 3841 | { | ||
| 3842 | u32 exec_control = vmcs_config.cpu_based_exec_ctrl; | ||
| 3843 | |||
| 3844 | if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) | ||
| 3845 | exec_control &= ~CPU_BASED_MOV_DR_EXITING; | ||
| 3846 | |||
| 3847 | if (!cpu_need_tpr_shadow(&vmx->vcpu)) { | ||
| 3848 | exec_control &= ~CPU_BASED_TPR_SHADOW; | ||
| 3849 | #ifdef CONFIG_X86_64 | ||
| 3850 | exec_control |= CPU_BASED_CR8_STORE_EXITING | | ||
| 3851 | CPU_BASED_CR8_LOAD_EXITING; | ||
| 3852 | #endif | ||
| 3853 | } | ||
| 3854 | if (!enable_ept) | ||
| 3855 | exec_control |= CPU_BASED_CR3_STORE_EXITING | | ||
| 3856 | CPU_BASED_CR3_LOAD_EXITING | | ||
| 3857 | CPU_BASED_INVLPG_EXITING; | ||
| 3858 | if (kvm_mwait_in_guest(vmx->vcpu.kvm)) | ||
| 3859 | exec_control &= ~(CPU_BASED_MWAIT_EXITING | | ||
| 3860 | CPU_BASED_MONITOR_EXITING); | ||
| 3861 | if (kvm_hlt_in_guest(vmx->vcpu.kvm)) | ||
| 3862 | exec_control &= ~CPU_BASED_HLT_EXITING; | ||
| 3863 | return exec_control; | ||
| 3864 | } | ||
| 3865 | |||
| 3866 | |||
| 3867 | static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) | ||
| 3868 | { | ||
| 3869 | struct kvm_vcpu *vcpu = &vmx->vcpu; | ||
| 3870 | |||
| 3871 | u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; | ||
| 3872 | |||
| 3873 | if (pt_mode == PT_MODE_SYSTEM) | ||
| 3874 | exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX); | ||
| 3875 | if (!cpu_need_virtualize_apic_accesses(vcpu)) | ||
| 3876 | exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
| 3877 | if (vmx->vpid == 0) | ||
| 3878 | exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; | ||
| 3879 | if (!enable_ept) { | ||
| 3880 | exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; | ||
| 3881 | enable_unrestricted_guest = 0; | ||
| 3882 | } | ||
| 3883 | if (!enable_unrestricted_guest) | ||
| 3884 | exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; | ||
| 3885 | if (kvm_pause_in_guest(vmx->vcpu.kvm)) | ||
| 3886 | exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; | ||
| 3887 | if (!kvm_vcpu_apicv_active(vcpu)) | ||
| 3888 | exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | | ||
| 3889 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); | ||
| 3890 | exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; | ||
| 3891 | |||
| 3892 | /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, | ||
| 3893 | * in vmx_set_cr4. */ | ||
| 3894 | exec_control &= ~SECONDARY_EXEC_DESC; | ||
| 3895 | |||
| 3896 | /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD | ||
| 3897 | (handle_vmptrld). | ||
| 3898 | We cannot enable shadow_vmcs here because we do not yet have | ||
| 3899 | a current VMCS12. | ||
| 3900 | */ | ||
| 3901 | exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; | ||
| 3902 | |||
| 3903 | if (!enable_pml) | ||
| 3904 | exec_control &= ~SECONDARY_EXEC_ENABLE_PML; | ||
| 3905 | |||
| 3906 | if (vmx_xsaves_supported()) { | ||
| 3907 | /* Exposing XSAVES only when XSAVE is exposed */ | ||
| 3908 | bool xsaves_enabled = | ||
| 3909 | guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && | ||
| 3910 | guest_cpuid_has(vcpu, X86_FEATURE_XSAVES); | ||
| 3911 | |||
| 3912 | if (!xsaves_enabled) | ||
| 3913 | exec_control &= ~SECONDARY_EXEC_XSAVES; | ||
| 3914 | |||
| 3915 | if (nested) { | ||
| 3916 | if (xsaves_enabled) | ||
| 3917 | vmx->nested.msrs.secondary_ctls_high |= | ||
| 3918 | SECONDARY_EXEC_XSAVES; | ||
| 3919 | else | ||
| 3920 | vmx->nested.msrs.secondary_ctls_high &= | ||
| 3921 | ~SECONDARY_EXEC_XSAVES; | ||
| 3922 | } | ||
| 3923 | } | ||
| 3924 | |||
| 3925 | if (vmx_rdtscp_supported()) { | ||
| 3926 | bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP); | ||
| 3927 | if (!rdtscp_enabled) | ||
| 3928 | exec_control &= ~SECONDARY_EXEC_RDTSCP; | ||
| 3929 | |||
| 3930 | if (nested) { | ||
| 3931 | if (rdtscp_enabled) | ||
| 3932 | vmx->nested.msrs.secondary_ctls_high |= | ||
| 3933 | SECONDARY_EXEC_RDTSCP; | ||
| 3934 | else | ||
| 3935 | vmx->nested.msrs.secondary_ctls_high &= | ||
| 3936 | ~SECONDARY_EXEC_RDTSCP; | ||
| 3937 | } | ||
| 3938 | } | ||
| 3939 | |||
| 3940 | if (vmx_invpcid_supported()) { | ||
| 3941 | /* Exposing INVPCID only when PCID is exposed */ | ||
| 3942 | bool invpcid_enabled = | ||
| 3943 | guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) && | ||
| 3944 | guest_cpuid_has(vcpu, X86_FEATURE_PCID); | ||
| 3945 | |||
| 3946 | if (!invpcid_enabled) { | ||
| 3947 | exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; | ||
| 3948 | guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID); | ||
| 3949 | } | ||
| 3950 | |||
| 3951 | if (nested) { | ||
| 3952 | if (invpcid_enabled) | ||
| 3953 | vmx->nested.msrs.secondary_ctls_high |= | ||
| 3954 | SECONDARY_EXEC_ENABLE_INVPCID; | ||
| 3955 | else | ||
| 3956 | vmx->nested.msrs.secondary_ctls_high &= | ||
| 3957 | ~SECONDARY_EXEC_ENABLE_INVPCID; | ||
| 3958 | } | ||
| 3959 | } | ||
| 3960 | |||
| 3961 | if (vmx_rdrand_supported()) { | ||
| 3962 | bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND); | ||
| 3963 | if (rdrand_enabled) | ||
| 3964 | exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING; | ||
| 3965 | |||
| 3966 | if (nested) { | ||
| 3967 | if (rdrand_enabled) | ||
| 3968 | vmx->nested.msrs.secondary_ctls_high |= | ||
| 3969 | SECONDARY_EXEC_RDRAND_EXITING; | ||
| 3970 | else | ||
| 3971 | vmx->nested.msrs.secondary_ctls_high &= | ||
| 3972 | ~SECONDARY_EXEC_RDRAND_EXITING; | ||
| 3973 | } | ||
| 3974 | } | ||
| 3975 | |||
| 3976 | if (vmx_rdseed_supported()) { | ||
| 3977 | bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED); | ||
| 3978 | if (rdseed_enabled) | ||
| 3979 | exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING; | ||
| 3980 | |||
| 3981 | if (nested) { | ||
| 3982 | if (rdseed_enabled) | ||
| 3983 | vmx->nested.msrs.secondary_ctls_high |= | ||
| 3984 | SECONDARY_EXEC_RDSEED_EXITING; | ||
| 3985 | else | ||
| 3986 | vmx->nested.msrs.secondary_ctls_high &= | ||
| 3987 | ~SECONDARY_EXEC_RDSEED_EXITING; | ||
| 3988 | } | ||
| 3989 | } | ||
| 3990 | |||
| 3991 | vmx->secondary_exec_control = exec_control; | ||
| 3992 | } | ||
| 3993 | |||
| 3994 | static void ept_set_mmio_spte_mask(void) | ||
| 3995 | { | ||
| 3996 | /* | ||
| 3997 | * EPT Misconfigurations can be generated if the value of bits 2:0 | ||
| 3998 | * of an EPT paging-structure entry is 110b (write/execute). | ||
| 3999 | */ | ||
| 4000 | kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK, | ||
| 4001 | VMX_EPT_MISCONFIG_WX_VALUE); | ||
| 4002 | } | ||
| 4003 | |||
| 4004 | #define VMX_XSS_EXIT_BITMAP 0 | ||
| 4005 | |||
| 4006 | /* | ||
| 4007 | * Sets up the vmcs for emulated real mode. | ||
| 4008 | */ | ||
| 4009 | static void vmx_vcpu_setup(struct vcpu_vmx *vmx) | ||
| 4010 | { | ||
| 4011 | int i; | ||
| 4012 | |||
| 4013 | if (nested) | ||
| 4014 | nested_vmx_vcpu_setup(); | ||
| 4015 | |||
| 4016 | if (cpu_has_vmx_msr_bitmap()) | ||
| 4017 | vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); | ||
| 4018 | |||
| 4019 | vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ | ||
| 4020 | |||
| 4021 | /* Control */ | ||
| 4022 | vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); | ||
| 4023 | vmx->hv_deadline_tsc = -1; | ||
| 4024 | |||
| 4025 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); | ||
| 4026 | |||
| 4027 | if (cpu_has_secondary_exec_ctrls()) { | ||
| 4028 | vmx_compute_secondary_exec_control(vmx); | ||
| 4029 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, | ||
| 4030 | vmx->secondary_exec_control); | ||
| 4031 | } | ||
| 4032 | |||
| 4033 | if (kvm_vcpu_apicv_active(&vmx->vcpu)) { | ||
| 4034 | vmcs_write64(EOI_EXIT_BITMAP0, 0); | ||
| 4035 | vmcs_write64(EOI_EXIT_BITMAP1, 0); | ||
| 4036 | vmcs_write64(EOI_EXIT_BITMAP2, 0); | ||
| 4037 | vmcs_write64(EOI_EXIT_BITMAP3, 0); | ||
| 4038 | |||
| 4039 | vmcs_write16(GUEST_INTR_STATUS, 0); | ||
| 4040 | |||
| 4041 | vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); | ||
| 4042 | vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); | ||
| 4043 | } | ||
| 4044 | |||
| 4045 | if (!kvm_pause_in_guest(vmx->vcpu.kvm)) { | ||
| 4046 | vmcs_write32(PLE_GAP, ple_gap); | ||
| 4047 | vmx->ple_window = ple_window; | ||
| 4048 | vmx->ple_window_dirty = true; | ||
| 4049 | } | ||
| 4050 | |||
| 4051 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); | ||
| 4052 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); | ||
| 4053 | vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ | ||
| 4054 | |||
| 4055 | vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ | ||
| 4056 | vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ | ||
| 4057 | vmx_set_constant_host_state(vmx); | ||
| 4058 | vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ | ||
| 4059 | vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ | ||
| 4060 | |||
| 4061 | if (cpu_has_vmx_vmfunc()) | ||
| 4062 | vmcs_write64(VM_FUNCTION_CONTROL, 0); | ||
| 4063 | |||
| 4064 | vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); | ||
| 4065 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); | ||
| 4066 | vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); | ||
| 4067 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); | ||
| 4068 | vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); | ||
| 4069 | |||
| 4070 | if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) | ||
| 4071 | vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); | ||
| 4072 | |||
| 4073 | for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { | ||
| 4074 | u32 index = vmx_msr_index[i]; | ||
| 4075 | u32 data_low, data_high; | ||
| 4076 | int j = vmx->nmsrs; | ||
| 4077 | |||
| 4078 | if (rdmsr_safe(index, &data_low, &data_high) < 0) | ||
| 4079 | continue; | ||
| 4080 | if (wrmsr_safe(index, data_low, data_high) < 0) | ||
| 4081 | continue; | ||
| 4082 | vmx->guest_msrs[j].index = i; | ||
| 4083 | vmx->guest_msrs[j].data = 0; | ||
| 4084 | vmx->guest_msrs[j].mask = -1ull; | ||
| 4085 | ++vmx->nmsrs; | ||
| 4086 | } | ||
| 4087 | |||
| 4088 | vmx->arch_capabilities = kvm_get_arch_capabilities(); | ||
| 4089 | |||
| 4090 | vm_exit_controls_init(vmx, vmx_vmexit_ctrl()); | ||
| 4091 | |||
| 4092 | /* 22.2.1, 20.8.1 */ | ||
| 4093 | vm_entry_controls_init(vmx, vmx_vmentry_ctrl()); | ||
| 4094 | |||
| 4095 | vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS; | ||
| 4096 | vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS); | ||
| 4097 | |||
| 4098 | set_cr4_guest_host_mask(vmx); | ||
| 4099 | |||
| 4100 | if (vmx_xsaves_supported()) | ||
| 4101 | vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); | ||
| 4102 | |||
| 4103 | if (enable_pml) { | ||
| 4104 | vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); | ||
| 4105 | vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); | ||
| 4106 | } | ||
| 4107 | |||
| 4108 | if (cpu_has_vmx_encls_vmexit()) | ||
| 4109 | vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); | ||
| 4110 | |||
| 4111 | if (pt_mode == PT_MODE_HOST_GUEST) { | ||
| 4112 | memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc)); | ||
| 4113 | /* Bits 6:0 are forced to 1; writes to them are ignored. */ | ||
| 4114 | vmx->pt_desc.guest.output_mask = 0x7F; | ||
| 4115 | vmcs_write64(GUEST_IA32_RTIT_CTL, 0); | ||
| 4116 | } | ||
| 4117 | } | ||
| 4118 | |||
| 4119 | static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) | ||
| 4120 | { | ||
| 4121 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 4122 | struct msr_data apic_base_msr; | ||
| 4123 | u64 cr0; | ||
| 4124 | |||
| 4125 | vmx->rmode.vm86_active = 0; | ||
| 4126 | vmx->spec_ctrl = 0; | ||
| 4127 | |||
| 4128 | vcpu->arch.microcode_version = 0x100000000ULL; | ||
| 4129 | vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); | ||
| 4130 | kvm_set_cr8(vcpu, 0); | ||
| 4131 | |||
| 4132 | if (!init_event) { | ||
| 4133 | apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | | ||
| 4134 | MSR_IA32_APICBASE_ENABLE; | ||
| 4135 | if (kvm_vcpu_is_reset_bsp(vcpu)) | ||
| 4136 | apic_base_msr.data |= MSR_IA32_APICBASE_BSP; | ||
| 4137 | apic_base_msr.host_initiated = true; | ||
| 4138 | kvm_set_apic_base(vcpu, &apic_base_msr); | ||
| 4139 | } | ||
| 4140 | |||
| 4141 | vmx_segment_cache_clear(vmx); | ||
| 4142 | |||
| 4143 | seg_setup(VCPU_SREG_CS); | ||
| 4144 | vmcs_write16(GUEST_CS_SELECTOR, 0xf000); | ||
| 4145 | vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); | ||
| 4146 | |||
| 4147 | seg_setup(VCPU_SREG_DS); | ||
| 4148 | seg_setup(VCPU_SREG_ES); | ||
| 4149 | seg_setup(VCPU_SREG_FS); | ||
| 4150 | seg_setup(VCPU_SREG_GS); | ||
| 4151 | seg_setup(VCPU_SREG_SS); | ||
| 4152 | |||
| 4153 | vmcs_write16(GUEST_TR_SELECTOR, 0); | ||
| 4154 | vmcs_writel(GUEST_TR_BASE, 0); | ||
| 4155 | vmcs_write32(GUEST_TR_LIMIT, 0xffff); | ||
| 4156 | vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); | ||
| 4157 | |||
| 4158 | vmcs_write16(GUEST_LDTR_SELECTOR, 0); | ||
| 4159 | vmcs_writel(GUEST_LDTR_BASE, 0); | ||
| 4160 | vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); | ||
| 4161 | vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); | ||
| 4162 | |||
| 4163 | if (!init_event) { | ||
| 4164 | vmcs_write32(GUEST_SYSENTER_CS, 0); | ||
| 4165 | vmcs_writel(GUEST_SYSENTER_ESP, 0); | ||
| 4166 | vmcs_writel(GUEST_SYSENTER_EIP, 0); | ||
| 4167 | vmcs_write64(GUEST_IA32_DEBUGCTL, 0); | ||
| 4168 | } | ||
| 4169 | |||
| 4170 | kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); | ||
| 4171 | kvm_rip_write(vcpu, 0xfff0); | ||
| 4172 | |||
| 4173 | vmcs_writel(GUEST_GDTR_BASE, 0); | ||
| 4174 | vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); | ||
| 4175 | |||
| 4176 | vmcs_writel(GUEST_IDTR_BASE, 0); | ||
| 4177 | vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); | ||
| 4178 | |||
| 4179 | vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); | ||
| 4180 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); | ||
| 4181 | vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); | ||
| 4182 | if (kvm_mpx_supported()) | ||
| 4183 | vmcs_write64(GUEST_BNDCFGS, 0); | ||
| 4184 | |||
| 4185 | setup_msrs(vmx); | ||
| 4186 | |||
| 4187 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ | ||
| 4188 | |||
| 4189 | if (cpu_has_vmx_tpr_shadow() && !init_event) { | ||
| 4190 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); | ||
| 4191 | if (cpu_need_tpr_shadow(vcpu)) | ||
| 4192 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, | ||
| 4193 | __pa(vcpu->arch.apic->regs)); | ||
| 4194 | vmcs_write32(TPR_THRESHOLD, 0); | ||
| 4195 | } | ||
| 4196 | |||
| 4197 | kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); | ||
| 4198 | |||
| 4199 | if (vmx->vpid != 0) | ||
| 4200 | vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); | ||
| 4201 | |||
| 4202 | cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; | ||
| 4203 | vmx->vcpu.arch.cr0 = cr0; | ||
| 4204 | vmx_set_cr0(vcpu, cr0); /* enter rmode */ | ||
| 4205 | vmx_set_cr4(vcpu, 0); | ||
| 4206 | vmx_set_efer(vcpu, 0); | ||
| 4207 | |||
| 4208 | update_exception_bitmap(vcpu); | ||
| 4209 | |||
| 4210 | vpid_sync_context(vmx->vpid); | ||
| 4211 | if (init_event) | ||
| 4212 | vmx_clear_hlt(vcpu); | ||
| 4213 | } | ||
| 4214 | |||
| 4215 | static void enable_irq_window(struct kvm_vcpu *vcpu) | ||
| 4216 | { | ||
| 4217 | vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, | ||
| 4218 | CPU_BASED_VIRTUAL_INTR_PENDING); | ||
| 4219 | } | ||
| 4220 | |||
| 4221 | static void enable_nmi_window(struct kvm_vcpu *vcpu) | ||
| 4222 | { | ||
| 4223 | if (!enable_vnmi || | ||
| 4224 | vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { | ||
| 4225 | enable_irq_window(vcpu); | ||
| 4226 | return; | ||
| 4227 | } | ||
| 4228 | |||
| 4229 | vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, | ||
| 4230 | CPU_BASED_VIRTUAL_NMI_PENDING); | ||
| 4231 | } | ||
| 4232 | |||
| 4233 | static void vmx_inject_irq(struct kvm_vcpu *vcpu) | ||
| 4234 | { | ||
| 4235 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 4236 | uint32_t intr; | ||
| 4237 | int irq = vcpu->arch.interrupt.nr; | ||
| 4238 | |||
| 4239 | trace_kvm_inj_virq(irq); | ||
| 4240 | |||
| 4241 | ++vcpu->stat.irq_injections; | ||
| 4242 | if (vmx->rmode.vm86_active) { | ||
| 4243 | int inc_eip = 0; | ||
| 4244 | if (vcpu->arch.interrupt.soft) | ||
| 4245 | inc_eip = vcpu->arch.event_exit_inst_len; | ||
| 4246 | if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE) | ||
| 4247 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); | ||
| 4248 | return; | ||
| 4249 | } | ||
| 4250 | intr = irq | INTR_INFO_VALID_MASK; | ||
| 4251 | if (vcpu->arch.interrupt.soft) { | ||
| 4252 | intr |= INTR_TYPE_SOFT_INTR; | ||
| 4253 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, | ||
| 4254 | vmx->vcpu.arch.event_exit_inst_len); | ||
| 4255 | } else | ||
| 4256 | intr |= INTR_TYPE_EXT_INTR; | ||
| 4257 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); | ||
| 4258 | |||
| 4259 | vmx_clear_hlt(vcpu); | ||
| 4260 | } | ||
| 4261 | |||
| 4262 | static void vmx_inject_nmi(struct kvm_vcpu *vcpu) | ||
| 4263 | { | ||
| 4264 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 4265 | |||
| 4266 | if (!enable_vnmi) { | ||
| 4267 | /* | ||
| 4268 | * Tracking the NMI-blocked state in software is built upon | ||
| 4269 | * finding the next open IRQ window. This, in turn, depends on | ||
| 4270 | * well-behaving guests: They have to keep IRQs disabled at | ||
| 4271 | * least as long as the NMI handler runs. Otherwise we may | ||
| 4272 | * cause NMI nesting, maybe breaking the guest. But as this is | ||
| 4273 | * highly unlikely, we can live with the residual risk. | ||
| 4274 | */ | ||
| 4275 | vmx->loaded_vmcs->soft_vnmi_blocked = 1; | ||
| 4276 | vmx->loaded_vmcs->vnmi_blocked_time = 0; | ||
| 4277 | } | ||
| 4278 | |||
| 4279 | ++vcpu->stat.nmi_injections; | ||
| 4280 | vmx->loaded_vmcs->nmi_known_unmasked = false; | ||
| 4281 | |||
| 4282 | if (vmx->rmode.vm86_active) { | ||
| 4283 | if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) | ||
| 4284 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); | ||
| 4285 | return; | ||
| 4286 | } | ||
| 4287 | |||
| 4288 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
| 4289 | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); | ||
| 4290 | |||
| 4291 | vmx_clear_hlt(vcpu); | ||
| 4292 | } | ||
| 4293 | |||
| 4294 | bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) | ||
| 4295 | { | ||
| 4296 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 4297 | bool masked; | ||
| 4298 | |||
| 4299 | if (!enable_vnmi) | ||
| 4300 | return vmx->loaded_vmcs->soft_vnmi_blocked; | ||
| 4301 | if (vmx->loaded_vmcs->nmi_known_unmasked) | ||
| 4302 | return false; | ||
| 4303 | masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; | ||
| 4304 | vmx->loaded_vmcs->nmi_known_unmasked = !masked; | ||
| 4305 | return masked; | ||
| 4306 | } | ||
| 4307 | |||
| 4308 | void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) | ||
| 4309 | { | ||
| 4310 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 4311 | |||
| 4312 | if (!enable_vnmi) { | ||
| 4313 | if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) { | ||
| 4314 | vmx->loaded_vmcs->soft_vnmi_blocked = masked; | ||
| 4315 | vmx->loaded_vmcs->vnmi_blocked_time = 0; | ||
| 4316 | } | ||
| 4317 | } else { | ||
| 4318 | vmx->loaded_vmcs->nmi_known_unmasked = !masked; | ||
| 4319 | if (masked) | ||
| 4320 | vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, | ||
| 4321 | GUEST_INTR_STATE_NMI); | ||
| 4322 | else | ||
| 4323 | vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, | ||
| 4324 | GUEST_INTR_STATE_NMI); | ||
| 4325 | } | ||
| 4326 | } | ||
| 4327 | |||
| 4328 | static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) | ||
| 4329 | { | ||
| 4330 | if (to_vmx(vcpu)->nested.nested_run_pending) | ||
| 4331 | return 0; | ||
| 4332 | |||
| 4333 | if (!enable_vnmi && | ||
| 4334 | to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked) | ||
| 4335 | return 0; | ||
| 4336 | |||
| 4337 | return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & | ||
| 4338 | (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | ||
| 4339 | | GUEST_INTR_STATE_NMI)); | ||
| 4340 | } | ||
| 4341 | |||
| 4342 | static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) | ||
| 4343 | { | ||
| 4344 | return (!to_vmx(vcpu)->nested.nested_run_pending && | ||
| 4345 | vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && | ||
| 4346 | !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & | ||
| 4347 | (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); | ||
| 4348 | } | ||
| 4349 | |||
| 4350 | static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) | ||
| 4351 | { | ||
| 4352 | int ret; | ||
| 4353 | |||
| 4354 | if (enable_unrestricted_guest) | ||
| 4355 | return 0; | ||
| 4356 | |||
| 4357 | ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, | ||
| 4358 | PAGE_SIZE * 3); | ||
| 4359 | if (ret) | ||
| 4360 | return ret; | ||
| 4361 | to_kvm_vmx(kvm)->tss_addr = addr; | ||
| 4362 | return init_rmode_tss(kvm); | ||
| 4363 | } | ||
| 4364 | |||
| 4365 | static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) | ||
| 4366 | { | ||
| 4367 | to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr; | ||
| 4368 | return 0; | ||
| 4369 | } | ||
| 4370 | |||
| 4371 | static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) | ||
| 4372 | { | ||
| 4373 | switch (vec) { | ||
| 4374 | case BP_VECTOR: | ||
| 4375 | /* | ||
| 4376 | * Update instruction length as we may reinject the exception | ||
| 4377 | * from user space while in guest debugging mode. | ||
| 4378 | */ | ||
| 4379 | to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = | ||
| 4380 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | ||
| 4381 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) | ||
| 4382 | return false; | ||
| 4383 | /* fall through */ | ||
| 4384 | case DB_VECTOR: | ||
| 4385 | if (vcpu->guest_debug & | ||
| 4386 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) | ||
| 4387 | return false; | ||
| 4388 | /* fall through */ | ||
| 4389 | case DE_VECTOR: | ||
| 4390 | case OF_VECTOR: | ||
| 4391 | case BR_VECTOR: | ||
| 4392 | case UD_VECTOR: | ||
| 4393 | case DF_VECTOR: | ||
| 4394 | case SS_VECTOR: | ||
| 4395 | case GP_VECTOR: | ||
| 4396 | case MF_VECTOR: | ||
| 4397 | return true; | ||
| 4398 | break; | ||
| 4399 | } | ||
| 4400 | return false; | ||
| 4401 | } | ||
| 4402 | |||
| 4403 | static int handle_rmode_exception(struct kvm_vcpu *vcpu, | ||
| 4404 | int vec, u32 err_code) | ||
| 4405 | { | ||
| 4406 | /* | ||
| 4407 | * An instruction with the address-size override prefix (opcode 0x67) | ||
| 4408 | * causes a #GP or #SS fault with a zero error code in VM86 mode. | ||
| 4409 | */ | ||
| 4410 | if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { | ||
| 4411 | if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) { | ||
| 4412 | if (vcpu->arch.halt_request) { | ||
| 4413 | vcpu->arch.halt_request = 0; | ||
| 4414 | return kvm_vcpu_halt(vcpu); | ||
| 4415 | } | ||
| 4416 | return 1; | ||
| 4417 | } | ||
| 4418 | return 0; | ||
| 4419 | } | ||
| 4420 | |||
| 4421 | /* | ||
| 4422 | * Forward all other exceptions that are valid in real mode. | ||
| 4423 | * FIXME: Breaks guest debugging in real mode, needs to be fixed with | ||
| 4424 | * the required debugging infrastructure rework. | ||
| 4425 | */ | ||
| 4426 | kvm_queue_exception(vcpu, vec); | ||
| 4427 | return 1; | ||
| 4428 | } | ||
| 4429 | |||
| 4430 | /* | ||
| 4431 | * Trigger machine check on the host. We assume all the MSRs are already set up | ||
| 4432 | * by the CPU and that we still run on the same CPU as the MCE occurred on. | ||
| 4433 | * We pass a fake environment to the machine check handler because we want | ||
| 4434 | * the guest to be always treated like user space, no matter what context | ||
| 4435 | * it used internally. | ||
| 4436 | */ | ||
| 4437 | static void kvm_machine_check(void) | ||
| 4438 | { | ||
| 4439 | #if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) | ||
| 4440 | struct pt_regs regs = { | ||
| 4441 | .cs = 3, /* Fake ring 3 no matter what the guest ran on */ | ||
| 4442 | .flags = X86_EFLAGS_IF, | ||
| 4443 | }; | ||
| 4444 | |||
| 4445 | do_machine_check(®s, 0); | ||
| 4446 | #endif | ||
| 4447 | } | ||
| 4448 | |||
| 4449 | static int handle_machine_check(struct kvm_vcpu *vcpu) | ||
| 4450 | { | ||
| 4451 | /* already handled by vcpu_run */ | ||
| 4452 | return 1; | ||
| 4453 | } | ||
| 4454 | |||
| 4455 | static int handle_exception(struct kvm_vcpu *vcpu) | ||
| 4456 | { | ||
| 4457 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 4458 | struct kvm_run *kvm_run = vcpu->run; | ||
| 4459 | u32 intr_info, ex_no, error_code; | ||
| 4460 | unsigned long cr2, rip, dr6; | ||
| 4461 | u32 vect_info; | ||
| 4462 | enum emulation_result er; | ||
| 4463 | |||
| 4464 | vect_info = vmx->idt_vectoring_info; | ||
| 4465 | intr_info = vmx->exit_intr_info; | ||
| 4466 | |||
| 4467 | if (is_machine_check(intr_info)) | ||
| 4468 | return handle_machine_check(vcpu); | ||
| 4469 | |||
| 4470 | if (is_nmi(intr_info)) | ||
| 4471 | return 1; /* already handled by vmx_vcpu_run() */ | ||
| 4472 | |||
| 4473 | if (is_invalid_opcode(intr_info)) | ||
| 4474 | return handle_ud(vcpu); | ||
| 4475 | |||
| 4476 | error_code = 0; | ||
| 4477 | if (intr_info & INTR_INFO_DELIVER_CODE_MASK) | ||
| 4478 | error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); | ||
| 4479 | |||
| 4480 | if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { | ||
| 4481 | WARN_ON_ONCE(!enable_vmware_backdoor); | ||
| 4482 | er = kvm_emulate_instruction(vcpu, | ||
| 4483 | EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); | ||
| 4484 | if (er == EMULATE_USER_EXIT) | ||
| 4485 | return 0; | ||
| 4486 | else if (er != EMULATE_DONE) | ||
| 4487 | kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); | ||
| 4488 | return 1; | ||
| 4489 | } | ||
| 4490 | |||
| 4491 | /* | ||
| 4492 | * A #PF with PFEC.RSVD = 1 indicates that the guest is accessing | ||
| 4493 | * MMIO, so it is better to report an internal error. | ||
| 4494 | * See the comments in vmx_handle_exit. | ||
| 4495 | */ | ||
| 4496 | if ((vect_info & VECTORING_INFO_VALID_MASK) && | ||
| 4497 | !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { | ||
| 4498 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; | ||
| 4499 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; | ||
| 4500 | vcpu->run->internal.ndata = 3; | ||
| 4501 | vcpu->run->internal.data[0] = vect_info; | ||
| 4502 | vcpu->run->internal.data[1] = intr_info; | ||
| 4503 | vcpu->run->internal.data[2] = error_code; | ||
| 4504 | return 0; | ||
| 4505 | } | ||
| 4506 | |||
| 4507 | if (is_page_fault(intr_info)) { | ||
| 4508 | cr2 = vmcs_readl(EXIT_QUALIFICATION); | ||
| 4509 | /* EPT won't cause page fault directly */ | ||
| 4510 | WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept); | ||
| 4511 | return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); | ||
| 4512 | } | ||
| 4513 | |||
| 4514 | ex_no = intr_info & INTR_INFO_VECTOR_MASK; | ||
| 4515 | |||
| 4516 | if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) | ||
| 4517 | return handle_rmode_exception(vcpu, ex_no, error_code); | ||
| 4518 | |||
| 4519 | switch (ex_no) { | ||
| 4520 | case AC_VECTOR: | ||
| 4521 | kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); | ||
| 4522 | return 1; | ||
| 4523 | case DB_VECTOR: | ||
| 4524 | dr6 = vmcs_readl(EXIT_QUALIFICATION); | ||
| 4525 | if (!(vcpu->guest_debug & | ||
| 4526 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { | ||
| 4527 | vcpu->arch.dr6 &= ~15; | ||
| 4528 | vcpu->arch.dr6 |= dr6 | DR6_RTM; | ||
| 4529 | if (is_icebp(intr_info)) | ||
| 4530 | skip_emulated_instruction(vcpu); | ||
| 4531 | |||
| 4532 | kvm_queue_exception(vcpu, DB_VECTOR); | ||
| 4533 | return 1; | ||
| 4534 | } | ||
| 4535 | kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; | ||
| 4536 | kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); | ||
| 4537 | /* fall through */ | ||
| 4538 | case BP_VECTOR: | ||
| 4539 | /* | ||
| 4540 | * Update instruction length as we may reinject #BP from | ||
| 4541 | * user space while in guest debugging mode. Reading it for | ||
| 4542 | * #DB as well causes no harm; it is not used in that case. | ||
| 4543 | */ | ||
| 4544 | vmx->vcpu.arch.event_exit_inst_len = | ||
| 4545 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | ||
| 4546 | kvm_run->exit_reason = KVM_EXIT_DEBUG; | ||
| 4547 | rip = kvm_rip_read(vcpu); | ||
| 4548 | kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; | ||
| 4549 | kvm_run->debug.arch.exception = ex_no; | ||
| 4550 | break; | ||
| 4551 | default: | ||
| 4552 | kvm_run->exit_reason = KVM_EXIT_EXCEPTION; | ||
| 4553 | kvm_run->ex.exception = ex_no; | ||
| 4554 | kvm_run->ex.error_code = error_code; | ||
| 4555 | break; | ||
| 4556 | } | ||
| 4557 | return 0; | ||
| 4558 | } | ||
| 4559 | |||
| 4560 | static int handle_external_interrupt(struct kvm_vcpu *vcpu) | ||
| 4561 | { | ||
| 4562 | ++vcpu->stat.irq_exits; | ||
| 4563 | return 1; | ||
| 4564 | } | ||
| 4565 | |||
| 4566 | static int handle_triple_fault(struct kvm_vcpu *vcpu) | ||
| 4567 | { | ||
| 4568 | vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; | ||
| 4569 | vcpu->mmio_needed = 0; | ||
| 4570 | return 0; | ||
| 4571 | } | ||
| 4572 | |||
| 4573 | static int handle_io(struct kvm_vcpu *vcpu) | ||
| 4574 | { | ||
| 4575 | unsigned long exit_qualification; | ||
| 4576 | int size, in, string; | ||
| 4577 | unsigned port; | ||
| 4578 | |||
| 4579 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 4580 | string = (exit_qualification & 16) != 0; | ||
| 4581 | |||
| 4582 | ++vcpu->stat.io_exits; | ||
| 4583 | |||
| 4584 | if (string) | ||
| 4585 | return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; | ||
| 4586 | |||
| 4587 | port = exit_qualification >> 16; | ||
| 4588 | size = (exit_qualification & 7) + 1; | ||
| 4589 | in = (exit_qualification & 8) != 0; | ||
| 4590 | |||
| 4591 | return kvm_fast_pio(vcpu, size, port, in); | ||
| 4592 | } | ||
| 4593 | |||
| 4594 | static void | ||
| 4595 | vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) | ||
| 4596 | { | ||
| 4597 | /* | ||
| 4598 | * Patch in the VMCALL instruction: | ||
| 4599 | */ | ||
| 4600 | hypercall[0] = 0x0f; | ||
| 4601 | hypercall[1] = 0x01; | ||
| 4602 | hypercall[2] = 0xc1; | ||
| 4603 | } | ||
| 4604 | |||
| 4605 | /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ | ||
| 4606 | static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) | ||
| 4607 | { | ||
| 4608 | if (is_guest_mode(vcpu)) { | ||
| 4609 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 4610 | unsigned long orig_val = val; | ||
| 4611 | |||
| 4612 | /* | ||
| 4613 | * We get here when L2 changed cr0 in a way that did not change | ||
| 4614 | * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), | ||
| 4615 | * but did change L0 shadowed bits. So we first calculate the | ||
| 4616 | * effective cr0 value that L1 would like to write into the | ||
| 4617 | * hardware. It consists of the L2-owned bits from the new | ||
| 4618 | * value combined with the L1-owned bits from L1's guest_cr0. | ||
| 4619 | */ | ||
| 4620 | val = (val & ~vmcs12->cr0_guest_host_mask) | | ||
| 4621 | (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); | ||
| 4622 | |||
| 4623 | if (!nested_guest_cr0_valid(vcpu, val)) | ||
| 4624 | return 1; | ||
| 4625 | |||
| 4626 | if (kvm_set_cr0(vcpu, val)) | ||
| 4627 | return 1; | ||
| 4628 | vmcs_writel(CR0_READ_SHADOW, orig_val); | ||
| 4629 | return 0; | ||
| 4630 | } else { | ||
| 4631 | if (to_vmx(vcpu)->nested.vmxon && | ||
| 4632 | !nested_host_cr0_valid(vcpu, val)) | ||
| 4633 | return 1; | ||
| 4634 | |||
| 4635 | return kvm_set_cr0(vcpu, val); | ||
| 4636 | } | ||
| 4637 | } | ||
| 4638 | |||
| 4639 | static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) | ||
| 4640 | { | ||
| 4641 | if (is_guest_mode(vcpu)) { | ||
| 4642 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 4643 | unsigned long orig_val = val; | ||
| 4644 | |||
| 4645 | /* analogously to handle_set_cr0 */ | ||
| 4646 | val = (val & ~vmcs12->cr4_guest_host_mask) | | ||
| 4647 | (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); | ||
| 4648 | if (kvm_set_cr4(vcpu, val)) | ||
| 4649 | return 1; | ||
| 4650 | vmcs_writel(CR4_READ_SHADOW, orig_val); | ||
| 4651 | return 0; | ||
| 4652 | } else | ||
| 4653 | return kvm_set_cr4(vcpu, val); | ||
| 4654 | } | ||
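The guest/host-mask arithmetic used by handle_set_cr0() and handle_set_cr4() above boils down to a single bit-merge: L1-owned bits come from L1's guest_crN, all other bits from the value L2 just wrote. A minimal sketch, for illustration only (merge_shadowed_cr_demo is a hypothetical name, not a kernel helper):

    #include <stdio.h>

    /* Bits set in owned_mask are owned by L1 and kept from L1's guest_crN;
     * all other bits are taken from the value L2 just tried to write. */
    static unsigned long merge_shadowed_cr_demo(unsigned long l2_val,
    					    unsigned long l1_guest_cr,
    					    unsigned long owned_mask)
    {
    	return (l2_val & ~owned_mask) | (l1_guest_cr & owned_mask);
    }

    int main(void)
    {
    	/* L1 owns CR0.PE (bit 0); L2 writes a value with only CR0.MP (bit 1) set. */
    	unsigned long merged = merge_shadowed_cr_demo(0x2UL, 0x1UL, 0x1UL);
    	printf("effective cr0 = %#lx\n", merged);	/* prints 0x3 */
    	return 0;
    }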
| 4655 | |||
| 4656 | static int handle_desc(struct kvm_vcpu *vcpu) | ||
| 4657 | { | ||
| 4658 | WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP)); | ||
| 4659 | return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; | ||
| 4660 | } | ||
| 4661 | |||
| 4662 | static int handle_cr(struct kvm_vcpu *vcpu) | ||
| 4663 | { | ||
| 4664 | unsigned long exit_qualification, val; | ||
| 4665 | int cr; | ||
| 4666 | int reg; | ||
| 4667 | int err; | ||
| 4668 | int ret; | ||
| 4669 | |||
| 4670 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 4671 | cr = exit_qualification & 15; | ||
| 4672 | reg = (exit_qualification >> 8) & 15; | ||
| 4673 | switch ((exit_qualification >> 4) & 3) { | ||
| 4674 | case 0: /* mov to cr */ | ||
| 4675 | val = kvm_register_readl(vcpu, reg); | ||
| 4676 | trace_kvm_cr_write(cr, val); | ||
| 4677 | switch (cr) { | ||
| 4678 | case 0: | ||
| 4679 | err = handle_set_cr0(vcpu, val); | ||
| 4680 | return kvm_complete_insn_gp(vcpu, err); | ||
| 4681 | case 3: | ||
| 4682 | WARN_ON_ONCE(enable_unrestricted_guest); | ||
| 4683 | err = kvm_set_cr3(vcpu, val); | ||
| 4684 | return kvm_complete_insn_gp(vcpu, err); | ||
| 4685 | case 4: | ||
| 4686 | err = handle_set_cr4(vcpu, val); | ||
| 4687 | return kvm_complete_insn_gp(vcpu, err); | ||
| 4688 | case 8: { | ||
| 4689 | u8 cr8_prev = kvm_get_cr8(vcpu); | ||
| 4690 | u8 cr8 = (u8)val; | ||
| 4691 | err = kvm_set_cr8(vcpu, cr8); | ||
| 4692 | ret = kvm_complete_insn_gp(vcpu, err); | ||
| 4693 | if (lapic_in_kernel(vcpu)) | ||
| 4694 | return ret; | ||
| 4695 | if (cr8_prev <= cr8) | ||
| 4696 | return ret; | ||
| 4697 | /* | ||
| 4698 | * TODO: we might be squashing a | ||
| 4699 | * KVM_GUESTDBG_SINGLESTEP-triggered | ||
| 4700 | * KVM_EXIT_DEBUG here. | ||
| 4701 | */ | ||
| 4702 | vcpu->run->exit_reason = KVM_EXIT_SET_TPR; | ||
| 4703 | return 0; | ||
| 4704 | } | ||
| 4705 | } | ||
| 4706 | break; | ||
| 4707 | case 2: /* clts */ | ||
| 4708 | WARN_ONCE(1, "Guest should always own CR0.TS"); | ||
| 4709 | vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); | ||
| 4710 | trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); | ||
| 4711 | return kvm_skip_emulated_instruction(vcpu); | ||
| 4712 | case 1: /*mov from cr*/ | ||
| 4713 | switch (cr) { | ||
| 4714 | case 3: | ||
| 4715 | WARN_ON_ONCE(enable_unrestricted_guest); | ||
| 4716 | val = kvm_read_cr3(vcpu); | ||
| 4717 | kvm_register_write(vcpu, reg, val); | ||
| 4718 | trace_kvm_cr_read(cr, val); | ||
| 4719 | return kvm_skip_emulated_instruction(vcpu); | ||
| 4720 | case 8: | ||
| 4721 | val = kvm_get_cr8(vcpu); | ||
| 4722 | kvm_register_write(vcpu, reg, val); | ||
| 4723 | trace_kvm_cr_read(cr, val); | ||
| 4724 | return kvm_skip_emulated_instruction(vcpu); | ||
| 4725 | } | ||
| 4726 | break; | ||
| 4727 | case 3: /* lmsw */ | ||
| 4728 | val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; | ||
| 4729 | trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); | ||
| 4730 | kvm_lmsw(vcpu, val); | ||
| 4731 | |||
| 4732 | return kvm_skip_emulated_instruction(vcpu); | ||
| 4733 | default: | ||
| 4734 | break; | ||
| 4735 | } | ||
| 4736 | vcpu->run->exit_reason = 0; | ||
| 4737 | vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", | ||
| 4738 | (int)(exit_qualification >> 4) & 3, cr); | ||
| 4739 | return 0; | ||
| 4740 | } | ||
| 4741 | |||
| 4742 | static int handle_dr(struct kvm_vcpu *vcpu) | ||
| 4743 | { | ||
| 4744 | unsigned long exit_qualification; | ||
| 4745 | int dr, dr7, reg; | ||
| 4746 | |||
| 4747 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 4748 | dr = exit_qualification & DEBUG_REG_ACCESS_NUM; | ||
| 4749 | |||
| 4750 | /* First, if DR does not exist, trigger UD */ | ||
| 4751 | if (!kvm_require_dr(vcpu, dr)) | ||
| 4752 | return 1; | ||
| 4753 | |||
| 4754 | /* Do not handle if the CPL > 0, will trigger GP on re-entry */ | ||
| 4755 | if (!kvm_require_cpl(vcpu, 0)) | ||
| 4756 | return 1; | ||
| 4757 | dr7 = vmcs_readl(GUEST_DR7); | ||
| 4758 | if (dr7 & DR7_GD) { | ||
| 4759 | /* | ||
| 4760 | * As the vm-exit takes precedence over the debug trap, we | ||
| 4761 | * need to emulate the latter, either for the host or the | ||
| 4762 | * guest debugging itself. | ||
| 4763 | */ | ||
| 4764 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { | ||
| 4765 | vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; | ||
| 4766 | vcpu->run->debug.arch.dr7 = dr7; | ||
| 4767 | vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); | ||
| 4768 | vcpu->run->debug.arch.exception = DB_VECTOR; | ||
| 4769 | vcpu->run->exit_reason = KVM_EXIT_DEBUG; | ||
| 4770 | return 0; | ||
| 4771 | } else { | ||
| 4772 | vcpu->arch.dr6 &= ~15; | ||
| 4773 | vcpu->arch.dr6 |= DR6_BD | DR6_RTM; | ||
| 4774 | kvm_queue_exception(vcpu, DB_VECTOR); | ||
| 4775 | return 1; | ||
| 4776 | } | ||
| 4777 | } | ||
| 4778 | |||
| 4779 | if (vcpu->guest_debug == 0) { | ||
| 4780 | vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, | ||
| 4781 | CPU_BASED_MOV_DR_EXITING); | ||
| 4782 | |||
| 4783 | /* | ||
| 4784 | * No more DR vmexits; force a reload of the debug registers | ||
| 4785 | * and reenter on this instruction. The next vmexit will | ||
| 4786 | * retrieve the full state of the debug registers. | ||
| 4787 | */ | ||
| 4788 | vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; | ||
| 4789 | return 1; | ||
| 4790 | } | ||
| 4791 | |||
| 4792 | reg = DEBUG_REG_ACCESS_REG(exit_qualification); | ||
| 4793 | if (exit_qualification & TYPE_MOV_FROM_DR) { | ||
| 4794 | unsigned long val; | ||
| 4795 | |||
| 4796 | if (kvm_get_dr(vcpu, dr, &val)) | ||
| 4797 | return 1; | ||
| 4798 | kvm_register_write(vcpu, reg, val); | ||
| 4799 | } else | ||
| 4800 | if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg))) | ||
| 4801 | return 1; | ||
| 4802 | |||
| 4803 | return kvm_skip_emulated_instruction(vcpu); | ||
| 4804 | } | ||
| 4805 | |||
| 4806 | static u64 vmx_get_dr6(struct kvm_vcpu *vcpu) | ||
| 4807 | { | ||
| 4808 | return vcpu->arch.dr6; | ||
| 4809 | } | ||
| 4810 | |||
| 4811 | static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) | ||
| 4812 | { | ||
| 4813 | } | ||
| 4814 | |||
| 4815 | static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) | ||
| 4816 | { | ||
| 4817 | get_debugreg(vcpu->arch.db[0], 0); | ||
| 4818 | get_debugreg(vcpu->arch.db[1], 1); | ||
| 4819 | get_debugreg(vcpu->arch.db[2], 2); | ||
| 4820 | get_debugreg(vcpu->arch.db[3], 3); | ||
| 4821 | get_debugreg(vcpu->arch.dr6, 6); | ||
| 4822 | vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); | ||
| 4823 | |||
| 4824 | vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; | ||
| 4825 | vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING); | ||
| 4826 | } | ||
| 4827 | |||
| 4828 | static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) | ||
| 4829 | { | ||
| 4830 | vmcs_writel(GUEST_DR7, val); | ||
| 4831 | } | ||
| 4832 | |||
| 4833 | static int handle_cpuid(struct kvm_vcpu *vcpu) | ||
| 4834 | { | ||
| 4835 | return kvm_emulate_cpuid(vcpu); | ||
| 4836 | } | ||
| 4837 | |||
| 4838 | static int handle_rdmsr(struct kvm_vcpu *vcpu) | ||
| 4839 | { | ||
| 4840 | u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; | ||
| 4841 | struct msr_data msr_info; | ||
| 4842 | |||
| 4843 | msr_info.index = ecx; | ||
| 4844 | msr_info.host_initiated = false; | ||
| 4845 | if (vmx_get_msr(vcpu, &msr_info)) { | ||
| 4846 | trace_kvm_msr_read_ex(ecx); | ||
| 4847 | kvm_inject_gp(vcpu, 0); | ||
| 4848 | return 1; | ||
| 4849 | } | ||
| 4850 | |||
| 4851 | trace_kvm_msr_read(ecx, msr_info.data); | ||
| 4852 | |||
| 4853 | /* FIXME: handling of bits 32:63 of rax, rdx */ | ||
| 4854 | vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u; | ||
| 4855 | vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u; | ||
| 4856 | return kvm_skip_emulated_instruction(vcpu); | ||
| 4857 | } | ||
| 4858 | |||
| 4859 | static int handle_wrmsr(struct kvm_vcpu *vcpu) | ||
| 4860 | { | ||
| 4861 | struct msr_data msr; | ||
| 4862 | u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; | ||
| 4863 | u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | ||
| 4864 | | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); | ||
| 4865 | |||
| 4866 | msr.data = data; | ||
| 4867 | msr.index = ecx; | ||
| 4868 | msr.host_initiated = false; | ||
| 4869 | if (kvm_set_msr(vcpu, &msr) != 0) { | ||
| 4870 | trace_kvm_msr_write_ex(ecx, data); | ||
| 4871 | kvm_inject_gp(vcpu, 0); | ||
| 4872 | return 1; | ||
| 4873 | } | ||
| 4874 | |||
| 4875 | trace_kvm_msr_write(ecx, data); | ||
| 4876 | return kvm_skip_emulated_instruction(vcpu); | ||
| 4877 | } | ||
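handle_rdmsr() and handle_wrmsr() above split and recombine the 64-bit MSR value across EDX:EAX exactly as the RDMSR/WRMSR instructions define it. A trivial sketch of that packing, with hypothetical helper names:

    #include <stdint.h>

    /* WRMSR takes the value as EDX:EAX; combine the two 32-bit halves. */
    static inline uint64_t msr_from_edx_eax_demo(uint32_t eax, uint32_t edx)
    {
    	return ((uint64_t)edx << 32) | eax;
    }

    /* RDMSR returns the value in EDX:EAX; split the 64-bit value. */
    static inline void msr_to_edx_eax_demo(uint64_t data, uint32_t *eax, uint32_t *edx)
    {
    	*eax = (uint32_t)data;
    	*edx = (uint32_t)(data >> 32);
    }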
| 4878 | |||
| 4879 | static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) | ||
| 4880 | { | ||
| 4881 | kvm_apic_update_ppr(vcpu); | ||
| 4882 | return 1; | ||
| 4883 | } | ||
| 4884 | |||
| 4885 | static int handle_interrupt_window(struct kvm_vcpu *vcpu) | ||
| 4886 | { | ||
| 4887 | vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, | ||
| 4888 | CPU_BASED_VIRTUAL_INTR_PENDING); | ||
| 4889 | |||
| 4890 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
| 4891 | |||
| 4892 | ++vcpu->stat.irq_window_exits; | ||
| 4893 | return 1; | ||
| 4894 | } | ||
| 4895 | |||
| 4896 | static int handle_halt(struct kvm_vcpu *vcpu) | ||
| 4897 | { | ||
| 4898 | return kvm_emulate_halt(vcpu); | ||
| 4899 | } | ||
| 4900 | |||
| 4901 | static int handle_vmcall(struct kvm_vcpu *vcpu) | ||
| 4902 | { | ||
| 4903 | return kvm_emulate_hypercall(vcpu); | ||
| 4904 | } | ||
| 4905 | |||
| 4906 | static int handle_invd(struct kvm_vcpu *vcpu) | ||
| 4907 | { | ||
| 4908 | return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; | ||
| 4909 | } | ||
| 4910 | |||
| 4911 | static int handle_invlpg(struct kvm_vcpu *vcpu) | ||
| 4912 | { | ||
| 4913 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 4914 | |||
| 4915 | kvm_mmu_invlpg(vcpu, exit_qualification); | ||
| 4916 | return kvm_skip_emulated_instruction(vcpu); | ||
| 4917 | } | ||
| 4918 | |||
| 4919 | static int handle_rdpmc(struct kvm_vcpu *vcpu) | ||
| 4920 | { | ||
| 4921 | int err; | ||
| 4922 | |||
| 4923 | err = kvm_rdpmc(vcpu); | ||
| 4924 | return kvm_complete_insn_gp(vcpu, err); | ||
| 4925 | } | ||
| 4926 | |||
| 4927 | static int handle_wbinvd(struct kvm_vcpu *vcpu) | ||
| 4928 | { | ||
| 4929 | return kvm_emulate_wbinvd(vcpu); | ||
| 4930 | } | ||
| 4931 | |||
| 4932 | static int handle_xsetbv(struct kvm_vcpu *vcpu) | ||
| 4933 | { | ||
| 4934 | u64 new_bv = kvm_read_edx_eax(vcpu); | ||
| 4935 | u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); | ||
| 4936 | |||
| 4937 | if (kvm_set_xcr(vcpu, index, new_bv) == 0) | ||
| 4938 | return kvm_skip_emulated_instruction(vcpu); | ||
| 4939 | return 1; | ||
| 4940 | } | ||
| 4941 | |||
| 4942 | static int handle_xsaves(struct kvm_vcpu *vcpu) | ||
| 4943 | { | ||
| 4944 | kvm_skip_emulated_instruction(vcpu); | ||
| 4945 | WARN(1, "this should never happen\n"); | ||
| 4946 | return 1; | ||
| 4947 | } | ||
| 4948 | |||
| 4949 | static int handle_xrstors(struct kvm_vcpu *vcpu) | ||
| 4950 | { | ||
| 4951 | kvm_skip_emulated_instruction(vcpu); | ||
| 4952 | WARN(1, "this should never happen\n"); | ||
| 4953 | return 1; | ||
| 4954 | } | ||
| 4955 | |||
| 4956 | static int handle_apic_access(struct kvm_vcpu *vcpu) | ||
| 4957 | { | ||
| 4958 | if (likely(fasteoi)) { | ||
| 4959 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 4960 | int access_type, offset; | ||
| 4961 | |||
| 4962 | access_type = exit_qualification & APIC_ACCESS_TYPE; | ||
| 4963 | offset = exit_qualification & APIC_ACCESS_OFFSET; | ||
| 4964 | /* | ||
| 4965 | * A sane guest uses MOV to write the EOI register, and the | ||
| 4966 | * written value does not matter. Short-circuit that case here | ||
| 4967 | * to avoid heavy instruction emulation. | ||
| 4968 | */ | ||
| 4969 | if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && | ||
| 4970 | (offset == APIC_EOI)) { | ||
| 4971 | kvm_lapic_set_eoi(vcpu); | ||
| 4972 | return kvm_skip_emulated_instruction(vcpu); | ||
| 4973 | } | ||
| 4974 | } | ||
| 4975 | return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; | ||
| 4976 | } | ||
| 4977 | |||
| 4978 | static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) | ||
| 4979 | { | ||
| 4980 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 4981 | int vector = exit_qualification & 0xff; | ||
| 4982 | |||
| 4983 | /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ | ||
| 4984 | kvm_apic_set_eoi_accelerated(vcpu, vector); | ||
| 4985 | return 1; | ||
| 4986 | } | ||
| 4987 | |||
| 4988 | static int handle_apic_write(struct kvm_vcpu *vcpu) | ||
| 4989 | { | ||
| 4990 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 4991 | u32 offset = exit_qualification & 0xfff; | ||
| 4992 | |||
| 4993 | /* APIC-write VM exit is trap-like and thus no need to adjust IP */ | ||
| 4994 | kvm_apic_write_nodecode(vcpu, offset); | ||
| 4995 | return 1; | ||
| 4996 | } | ||
| 4997 | |||
| 4998 | static int handle_task_switch(struct kvm_vcpu *vcpu) | ||
| 4999 | { | ||
| 5000 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 5001 | unsigned long exit_qualification; | ||
| 5002 | bool has_error_code = false; | ||
| 5003 | u32 error_code = 0; | ||
| 5004 | u16 tss_selector; | ||
| 5005 | int reason, type, idt_v, idt_index; | ||
| 5006 | |||
| 5007 | idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); | ||
| 5008 | idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); | ||
| 5009 | type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); | ||
| 5010 | |||
| 5011 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 5012 | |||
| 5013 | reason = (u32)exit_qualification >> 30; | ||
| 5014 | if (reason == TASK_SWITCH_GATE && idt_v) { | ||
| 5015 | switch (type) { | ||
| 5016 | case INTR_TYPE_NMI_INTR: | ||
| 5017 | vcpu->arch.nmi_injected = false; | ||
| 5018 | vmx_set_nmi_mask(vcpu, true); | ||
| 5019 | break; | ||
| 5020 | case INTR_TYPE_EXT_INTR: | ||
| 5021 | case INTR_TYPE_SOFT_INTR: | ||
| 5022 | kvm_clear_interrupt_queue(vcpu); | ||
| 5023 | break; | ||
| 5024 | case INTR_TYPE_HARD_EXCEPTION: | ||
| 5025 | if (vmx->idt_vectoring_info & | ||
| 5026 | VECTORING_INFO_DELIVER_CODE_MASK) { | ||
| 5027 | has_error_code = true; | ||
| 5028 | error_code = | ||
| 5029 | vmcs_read32(IDT_VECTORING_ERROR_CODE); | ||
| 5030 | } | ||
| 5031 | /* fall through */ | ||
| 5032 | case INTR_TYPE_SOFT_EXCEPTION: | ||
| 5033 | kvm_clear_exception_queue(vcpu); | ||
| 5034 | break; | ||
| 5035 | default: | ||
| 5036 | break; | ||
| 5037 | } | ||
| 5038 | } | ||
| 5039 | tss_selector = exit_qualification; | ||
| 5040 | |||
| 5041 | if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && | ||
| 5042 | type != INTR_TYPE_EXT_INTR && | ||
| 5043 | type != INTR_TYPE_NMI_INTR)) | ||
| 5044 | skip_emulated_instruction(vcpu); | ||
| 5045 | |||
| 5046 | if (kvm_task_switch(vcpu, tss_selector, | ||
| 5047 | type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason, | ||
| 5048 | has_error_code, error_code) == EMULATE_FAIL) { | ||
| 5049 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; | ||
| 5050 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; | ||
| 5051 | vcpu->run->internal.ndata = 0; | ||
| 5052 | return 0; | ||
| 5053 | } | ||
| 5054 | |||
| 5055 | /* | ||
| 5056 | * TODO: What about debug traps on tss switch? | ||
| 5057 | * Are we supposed to inject them and update dr6? | ||
| 5058 | */ | ||
| 5059 | |||
| 5060 | return 1; | ||
| 5061 | } | ||
| 5062 | |||
| 5063 | static int handle_ept_violation(struct kvm_vcpu *vcpu) | ||
| 5064 | { | ||
| 5065 | unsigned long exit_qualification; | ||
| 5066 | gpa_t gpa; | ||
| 5067 | u64 error_code; | ||
| 5068 | |||
| 5069 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 5070 | |||
| 5071 | /* | ||
| 5072 | * If the EPT violation happened while executing IRET from an NMI, | ||
| 5073 | * the "blocked by NMI" bit has to be set before the next VM entry. | ||
| 5074 | * There are errata that may cause this bit not to be set: | ||
| 5075 | * AAK134, BY25. | ||
| 5076 | */ | ||
| 5077 | if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && | ||
| 5078 | enable_vnmi && | ||
| 5079 | (exit_qualification & INTR_INFO_UNBLOCK_NMI)) | ||
| 5080 | vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); | ||
| 5081 | |||
| 5082 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); | ||
| 5083 | trace_kvm_page_fault(gpa, exit_qualification); | ||
| 5084 | |||
| 5085 | /* Is it a read fault? */ | ||
| 5086 | error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) | ||
| 5087 | ? PFERR_USER_MASK : 0; | ||
| 5088 | /* Is it a write fault? */ | ||
| 5089 | error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) | ||
| 5090 | ? PFERR_WRITE_MASK : 0; | ||
| 5091 | /* Is it a fetch fault? */ | ||
| 5092 | error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) | ||
| 5093 | ? PFERR_FETCH_MASK : 0; | ||
| 5094 | /* Was the EPT paging-structure entry present? */ | ||
| 5095 | error_code |= (exit_qualification & | ||
| 5096 | (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE | | ||
| 5097 | EPT_VIOLATION_EXECUTABLE)) | ||
| 5098 | ? PFERR_PRESENT_MASK : 0; | ||
| 5099 | |||
| 5100 | error_code |= (exit_qualification & 0x100) != 0 ? | ||
| 5101 | PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; | ||
| 5102 | |||
| 5103 | vcpu->arch.exit_qualification = exit_qualification; | ||
| 5104 | return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); | ||
| 5105 | } | ||
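As a worked illustration of the bit translation done in handle_ept_violation() above, the following standalone sketch maps the relevant exit-qualification bits onto page-fault error-code style flags. It is not kernel code; the *_DEMO constants only mirror the kernel's EPT_VIOLATION_* and PFERR_* definitions for this example, and the function name is hypothetical.

    #include <stdint.h>

    #define EPT_Q_READ_DEMO   (1u << 0)  /* access was a data read */
    #define EPT_Q_WRITE_DEMO  (1u << 1)  /* access was a data write */
    #define EPT_Q_FETCH_DEMO  (1u << 2)  /* access was an instruction fetch */
    #define EPT_Q_PERM_DEMO   (7u << 3)  /* entry was readable/writable/executable */

    #define PF_PRESENT_DEMO   (1u << 0)
    #define PF_WRITE_DEMO     (1u << 1)
    #define PF_USER_DEMO      (1u << 2)
    #define PF_FETCH_DEMO     (1u << 4)

    static uint32_t ept_qual_to_pferr_demo(uint64_t qual)
    {
    	uint32_t ec = 0;

    	if (qual & EPT_Q_READ_DEMO)
    		ec |= PF_USER_DEMO;
    	if (qual & EPT_Q_WRITE_DEMO)
    		ec |= PF_WRITE_DEMO;
    	if (qual & EPT_Q_FETCH_DEMO)
    		ec |= PF_FETCH_DEMO;
    	if (qual & EPT_Q_PERM_DEMO)	/* a translation existed, so this is a
    					   permission violation, not "not present" */
    		ec |= PF_PRESENT_DEMO;
    	return ec;
    }

For example, a guest write to a page whose EPT entry is readable but not writable sets qualification bits 1 and 3, which decodes to PF_WRITE_DEMO | PF_PRESENT_DEMO.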
| 5106 | |||
| 5107 | static int handle_ept_misconfig(struct kvm_vcpu *vcpu) | ||
| 5108 | { | ||
| 5109 | gpa_t gpa; | ||
| 5110 | |||
| 5111 | /* | ||
| 5112 | * A nested guest cannot optimize MMIO vmexits, because we have an | ||
| 5113 | * nGPA here instead of the required GPA. | ||
| 5114 | */ | ||
| 5115 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); | ||
| 5116 | if (!is_guest_mode(vcpu) && | ||
| 5117 | !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { | ||
| 5118 | trace_kvm_fast_mmio(gpa); | ||
| 5119 | /* | ||
| 5120 | * Doing kvm_skip_emulated_instruction() here relies on undefined | ||
| 5121 | * behavior: Intel's manual does not mandate that | ||
| 5122 | * VM_EXIT_INSTRUCTION_LEN be set in the VMCS when an EPT misconfig | ||
| 5123 | * occurs. While it was observed to be set on real hardware, other | ||
| 5124 | * hypervisors (namely Hyper-V) do not set it, so we would end up | ||
| 5125 | * advancing the IP by some random value. Disable fast MMIO when | ||
| 5126 | * running nested and keep it for real hardware, in the hope that | ||
| 5127 | * VM_EXIT_INSTRUCTION_LEN will always be set correctly there. | ||
| 5128 | */ | ||
| 5129 | if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) | ||
| 5130 | return kvm_skip_emulated_instruction(vcpu); | ||
| 5131 | else | ||
| 5132 | return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) == | ||
| 5133 | EMULATE_DONE; | ||
| 5134 | } | ||
| 5135 | |||
| 5136 | return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); | ||
| 5137 | } | ||
| 5138 | |||
| 5139 | static int handle_nmi_window(struct kvm_vcpu *vcpu) | ||
| 5140 | { | ||
| 5141 | WARN_ON_ONCE(!enable_vnmi); | ||
| 5142 | vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, | ||
| 5143 | CPU_BASED_VIRTUAL_NMI_PENDING); | ||
| 5144 | ++vcpu->stat.nmi_window_exits; | ||
| 5145 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
| 5146 | |||
| 5147 | return 1; | ||
| 5148 | } | ||
| 5149 | |||
| 5150 | static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) | ||
| 5151 | { | ||
| 5152 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 5153 | enum emulation_result err = EMULATE_DONE; | ||
| 5154 | int ret = 1; | ||
| 5155 | u32 cpu_exec_ctrl; | ||
| 5156 | bool intr_window_requested; | ||
| 5157 | unsigned count = 130; | ||
| 5158 | |||
| 5159 | /* | ||
| 5160 | * We should never reach the point where we are emulating L2 | ||
| 5161 | * due to invalid guest state as that means we incorrectly | ||
| 5162 | * allowed a nested VMEntry with an invalid vmcs12. | ||
| 5163 | */ | ||
| 5164 | WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending); | ||
| 5165 | |||
| 5166 | cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | ||
| 5167 | intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; | ||
| 5168 | |||
| 5169 | while (vmx->emulation_required && count-- != 0) { | ||
| 5170 | if (intr_window_requested && vmx_interrupt_allowed(vcpu)) | ||
| 5171 | return handle_interrupt_window(&vmx->vcpu); | ||
| 5172 | |||
| 5173 | if (kvm_test_request(KVM_REQ_EVENT, vcpu)) | ||
| 5174 | return 1; | ||
| 5175 | |||
| 5176 | err = kvm_emulate_instruction(vcpu, 0); | ||
| 5177 | |||
| 5178 | if (err == EMULATE_USER_EXIT) { | ||
| 5179 | ++vcpu->stat.mmio_exits; | ||
| 5180 | ret = 0; | ||
| 5181 | goto out; | ||
| 5182 | } | ||
| 5183 | |||
| 5184 | if (err != EMULATE_DONE) | ||
| 5185 | goto emulation_error; | ||
| 5186 | |||
| 5187 | if (vmx->emulation_required && !vmx->rmode.vm86_active && | ||
| 5188 | vcpu->arch.exception.pending) | ||
| 5189 | goto emulation_error; | ||
| 5190 | |||
| 5191 | if (vcpu->arch.halt_request) { | ||
| 5192 | vcpu->arch.halt_request = 0; | ||
| 5193 | ret = kvm_vcpu_halt(vcpu); | ||
| 5194 | goto out; | ||
| 5195 | } | ||
| 5196 | |||
| 5197 | if (signal_pending(current)) | ||
| 5198 | goto out; | ||
| 5199 | if (need_resched()) | ||
| 5200 | schedule(); | ||
| 5201 | } | ||
| 5202 | |||
| 5203 | out: | ||
| 5204 | return ret; | ||
| 5205 | |||
| 5206 | emulation_error: | ||
| 5207 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; | ||
| 5208 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; | ||
| 5209 | vcpu->run->internal.ndata = 0; | ||
| 5210 | return 0; | ||
| 5211 | } | ||
| 5212 | |||
| 5213 | static void grow_ple_window(struct kvm_vcpu *vcpu) | ||
| 5214 | { | ||
| 5215 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 5216 | int old = vmx->ple_window; | ||
| 5217 | |||
| 5218 | vmx->ple_window = __grow_ple_window(old, ple_window, | ||
| 5219 | ple_window_grow, | ||
| 5220 | ple_window_max); | ||
| 5221 | |||
| 5222 | if (vmx->ple_window != old) | ||
| 5223 | vmx->ple_window_dirty = true; | ||
| 5224 | |||
| 5225 | trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old); | ||
| 5226 | } | ||
| 5227 | |||
| 5228 | static void shrink_ple_window(struct kvm_vcpu *vcpu) | ||
| 5229 | { | ||
| 5230 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 5231 | int old = vmx->ple_window; | ||
| 5232 | |||
| 5233 | vmx->ple_window = __shrink_ple_window(old, ple_window, | ||
| 5234 | ple_window_shrink, | ||
| 5235 | ple_window); | ||
| 5236 | |||
| 5237 | if (vmx->ple_window != old) | ||
| 5238 | vmx->ple_window_dirty = true; | ||
| 5239 | |||
| 5240 | trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old); | ||
| 5241 | } | ||
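
grow_ple_window() and shrink_ple_window() only record the new value and mark it dirty; the actual sizing policy lives in the __grow_ple_window()/__shrink_ple_window() helpers, which are not shown here. The sketch below illustrates one plausible clamped multiply/divide policy under that assumption and is not a copy of the kernel helpers.

#include <stdio.h>

/* Grow the window by the modifier, clamped to [base, max]. */
static unsigned int grow_window(unsigned int old, unsigned int base,
				unsigned int modifier, unsigned int max)
{
	unsigned long long val = old;

	if (modifier)
		val *= modifier;
	if (val < base)
		val = base;
	if (val > max)
		val = max;
	return (unsigned int)val;
}

/* Shrink the window by the modifier, never going below min. */
static unsigned int shrink_window(unsigned int old, unsigned int modifier,
				  unsigned int min)
{
	unsigned int val = old;

	if (modifier)
		val /= modifier;
	if (val < min)
		val = min;
	return val;
}

int main(void)
{
	unsigned int w = 4096;

	w = grow_window(w, 4096, 2, 65536);	/* vCPU kept PAUSE-exiting */
	printf("grown: %u\n", w);
	w = shrink_window(w, 2, 4096);		/* contention went away */
	printf("shrunk: %u\n", w);
	return 0;
}
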
| 5242 | |||
| 5243 | /* | ||
| 5244 | * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. | ||
| 5245 | */ | ||
| 5246 | static void wakeup_handler(void) | ||
| 5247 | { | ||
| 5248 | struct kvm_vcpu *vcpu; | ||
| 5249 | int cpu = smp_processor_id(); | ||
| 5250 | |||
| 5251 | spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); | ||
| 5252 | list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), | ||
| 5253 | blocked_vcpu_list) { | ||
| 5254 | struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); | ||
| 5255 | |||
| 5256 | if (pi_test_on(pi_desc) == 1) | ||
| 5257 | kvm_vcpu_kick(vcpu); | ||
| 5258 | } | ||
| 5259 | spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); | ||
| 5260 | } | ||
| 5261 | |||
| 5262 | static void vmx_enable_tdp(void) | ||
| 5263 | { | ||
| 5264 | kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, | ||
| 5265 | enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull, | ||
| 5266 | enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, | ||
| 5267 | 0ull, VMX_EPT_EXECUTABLE_MASK, | ||
| 5268 | cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, | ||
| 5269 | VMX_EPT_RWX_MASK, 0ull); | ||
| 5270 | |||
| 5271 | ept_set_mmio_spte_mask(); | ||
| 5272 | kvm_enable_tdp(); | ||
| 5273 | } | ||
| 5274 | |||
| 5275 | /* | ||
| 5276 | * Indicate a vcpu that is busy-waiting on a spinlock. We do not enable | ||
| 5277 | * PAUSE exiting, so we only get here on a CPU with PAUSE-loop exiting. | ||
| 5278 | */ | ||
| 5279 | static int handle_pause(struct kvm_vcpu *vcpu) | ||
| 5280 | { | ||
| 5281 | if (!kvm_pause_in_guest(vcpu->kvm)) | ||
| 5282 | grow_ple_window(vcpu); | ||
| 5283 | |||
| 5284 | /* | ||
| 5285 | * Intel SDM vol. 3, ch. 25.1.3 says: the "PAUSE-loop exiting" | ||
| 5286 | * VM-execution control is ignored if CPL > 0. OTOH, KVM | ||
| 5287 | * never sets PAUSE_EXITING and only sets PLE if supported, | ||
| 5288 | * so the vcpu must be at CPL 0 if it gets a PAUSE exit. | ||
| 5289 | */ | ||
| 5290 | kvm_vcpu_on_spin(vcpu, true); | ||
| 5291 | return kvm_skip_emulated_instruction(vcpu); | ||
| 5292 | } | ||
| 5293 | |||
| 5294 | static int handle_nop(struct kvm_vcpu *vcpu) | ||
| 5295 | { | ||
| 5296 | return kvm_skip_emulated_instruction(vcpu); | ||
| 5297 | } | ||
| 5298 | |||
| 5299 | static int handle_mwait(struct kvm_vcpu *vcpu) | ||
| 5300 | { | ||
| 5301 | printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); | ||
| 5302 | return handle_nop(vcpu); | ||
| 5303 | } | ||
| 5304 | |||
| 5305 | static int handle_invalid_op(struct kvm_vcpu *vcpu) | ||
| 5306 | { | ||
| 5307 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
| 5308 | return 1; | ||
| 5309 | } | ||
| 5310 | |||
| 5311 | static int handle_monitor_trap(struct kvm_vcpu *vcpu) | ||
| 5312 | { | ||
| 5313 | return 1; | ||
| 5314 | } | ||
| 5315 | |||
| 5316 | static int handle_monitor(struct kvm_vcpu *vcpu) | ||
| 5317 | { | ||
| 5318 | printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); | ||
| 5319 | return handle_nop(vcpu); | ||
| 5320 | } | ||
| 5321 | |||
| 5322 | static int handle_invpcid(struct kvm_vcpu *vcpu) | ||
| 5323 | { | ||
| 5324 | u32 vmx_instruction_info; | ||
| 5325 | unsigned long type; | ||
| 5326 | bool pcid_enabled; | ||
| 5327 | gva_t gva; | ||
| 5328 | struct x86_exception e; | ||
| 5329 | unsigned i; | ||
| 5330 | unsigned long roots_to_free = 0; | ||
| 5331 | struct { | ||
| 5332 | u64 pcid; | ||
| 5333 | u64 gla; | ||
| 5334 | } operand; | ||
| 5335 | |||
| 5336 | if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { | ||
| 5337 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
| 5338 | return 1; | ||
| 5339 | } | ||
| 5340 | |||
| 5341 | vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
| 5342 | type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); | ||
| 5343 | |||
| 5344 | if (type > 3) { | ||
| 5345 | kvm_inject_gp(vcpu, 0); | ||
| 5346 | return 1; | ||
| 5347 | } | ||
| 5348 | |||
| 5349 | /* According to the Intel instruction reference, the memory operand | ||
| 5350 | * is read even if it isn't needed (e.g., for type==all) | ||
| 5351 | */ | ||
| 5352 | if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), | ||
| 5353 | vmx_instruction_info, false, &gva)) | ||
| 5354 | return 1; | ||
| 5355 | |||
| 5356 | if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { | ||
| 5357 | kvm_inject_page_fault(vcpu, &e); | ||
| 5358 | return 1; | ||
| 5359 | } | ||
| 5360 | |||
| 5361 | if (operand.pcid >> 12 != 0) { | ||
| 5362 | kvm_inject_gp(vcpu, 0); | ||
| 5363 | return 1; | ||
| 5364 | } | ||
| 5365 | |||
| 5366 | pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); | ||
| 5367 | |||
| 5368 | switch (type) { | ||
| 5369 | case INVPCID_TYPE_INDIV_ADDR: | ||
| 5370 | if ((!pcid_enabled && (operand.pcid != 0)) || | ||
| 5371 | is_noncanonical_address(operand.gla, vcpu)) { | ||
| 5372 | kvm_inject_gp(vcpu, 0); | ||
| 5373 | return 1; | ||
| 5374 | } | ||
| 5375 | kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid); | ||
| 5376 | return kvm_skip_emulated_instruction(vcpu); | ||
| 5377 | |||
| 5378 | case INVPCID_TYPE_SINGLE_CTXT: | ||
| 5379 | if (!pcid_enabled && (operand.pcid != 0)) { | ||
| 5380 | kvm_inject_gp(vcpu, 0); | ||
| 5381 | return 1; | ||
| 5382 | } | ||
| 5383 | |||
| 5384 | if (kvm_get_active_pcid(vcpu) == operand.pcid) { | ||
| 5385 | kvm_mmu_sync_roots(vcpu); | ||
| 5386 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); | ||
| 5387 | } | ||
| 5388 | |||
| 5389 | for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) | ||
| 5390 | if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3) | ||
| 5391 | == operand.pcid) | ||
| 5392 | roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); | ||
| 5393 | |||
| 5394 | kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); | ||
| 5395 | /* | ||
| 5396 | * If neither the current cr3 nor any of the prev_roots use the | ||
| 5397 | * given PCID, then nothing needs to be done here because a | ||
| 5398 | * resync will happen anyway before switching to any other CR3. | ||
| 5399 | */ | ||
| 5400 | |||
| 5401 | return kvm_skip_emulated_instruction(vcpu); | ||
| 5402 | |||
| 5403 | case INVPCID_TYPE_ALL_NON_GLOBAL: | ||
| 5404 | /* | ||
| 5405 | * Currently, KVM doesn't mark global entries in the shadow | ||
| 5406 | * page tables, so a non-global flush just degenerates to a | ||
| 5407 | * global flush. If needed, we could optimize this later by | ||
| 5408 | * keeping track of global entries in shadow page tables. | ||
| 5409 | */ | ||
| 5410 | |||
| 5411 | /* fall-through */ | ||
| 5412 | case INVPCID_TYPE_ALL_INCL_GLOBAL: | ||
| 5413 | kvm_mmu_unload(vcpu); | ||
| 5414 | return kvm_skip_emulated_instruction(vcpu); | ||
| 5415 | |||
| 5416 | default: | ||
| 5417 | BUG(); /* We have already checked above that type <= 3 */ | ||
| 5418 | } | ||
| 5419 | } | ||
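
handle_invpcid() validates the descriptor before touching any MMU state: the type must be 0..3, the PCID must fit in 12 bits, a non-zero PCID requires CR4.PCIDE, and the individual-address form also requires a canonical address. The userspace sketch below mirrors just those checks; the enum names, the 48-bit canonicality assumption and invpcid_check() itself are illustrative, not kernel definitions.

#include <stdint.h>
#include <stdio.h>

enum { INVPCID_ADDR, INVPCID_SINGLE, INVPCID_ALL_GLOBAL, INVPCID_ALL_NONGLOBAL };

struct invpcid_desc {
	uint64_t pcid;	/* must fit in 12 bits */
	uint64_t gla;	/* checked for canonicality for the address form */
};

static int is_noncanonical(uint64_t gla)
{
	/* 48-bit virtual addresses assumed for this sketch. */
	return ((int64_t)(gla << 16) >> 16) != (int64_t)gla;
}

static const char *invpcid_check(unsigned long type, int pcid_enabled,
				 const struct invpcid_desc *d)
{
	if (type > 3)
		return "#GP: bad type";
	if (d->pcid >> 12)
		return "#GP: PCID wider than 12 bits";
	if ((type == INVPCID_ADDR || type == INVPCID_SINGLE) &&
	    !pcid_enabled && d->pcid != 0)
		return "#GP: non-zero PCID with CR4.PCIDE clear";
	if (type == INVPCID_ADDR && is_noncanonical(d->gla))
		return "#GP: non-canonical address";
	return "ok";
}

int main(void)
{
	struct invpcid_desc d = { .pcid = 5, .gla = 0x7fffdeadbeefull };

	printf("%s\n", invpcid_check(INVPCID_ADDR, 1, &d));
	printf("%s\n", invpcid_check(INVPCID_ADDR, 0, &d));
	return 0;
}
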
| 5420 | |||
| 5421 | static int handle_pml_full(struct kvm_vcpu *vcpu) | ||
| 5422 | { | ||
| 5423 | unsigned long exit_qualification; | ||
| 5424 | |||
| 5425 | trace_kvm_pml_full(vcpu->vcpu_id); | ||
| 5426 | |||
| 5427 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
| 5428 | |||
| 5429 | /* | ||
| 5430 | * If the PML-buffer-full exit happened while executing IRET from an | ||
| 5431 | * NMI, the "blocked by NMI" bit has to be set before the next VM entry. | ||
| 5432 | */ | ||
| 5433 | if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && | ||
| 5434 | enable_vnmi && | ||
| 5435 | (exit_qualification & INTR_INFO_UNBLOCK_NMI)) | ||
| 5436 | vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, | ||
| 5437 | GUEST_INTR_STATE_NMI); | ||
| 5438 | |||
| 5439 | /* | ||
| 5440 | * The PML buffer was already flushed at the beginning of the VMEXIT. | ||
| 5441 | * Nothing to do here, and no userspace involvement is needed for PML. | ||
| 5442 | */ | ||
| 5443 | return 1; | ||
| 5444 | } | ||
| 5445 | |||
| 5446 | static int handle_preemption_timer(struct kvm_vcpu *vcpu) | ||
| 5447 | { | ||
| 5448 | if (!to_vmx(vcpu)->req_immediate_exit) | ||
| 5449 | kvm_lapic_expired_hv_timer(vcpu); | ||
| 5450 | return 1; | ||
| 5451 | } | ||
| 5452 | |||
| 5453 | /* | ||
| 5454 | * When nested=0, all VMX-instruction VM exits land here. The handlers | ||
| 5455 | * are overwritten by nested_vmx_setup() when nested=1. | ||
| 5456 | */ | ||
| 5457 | static int handle_vmx_instruction(struct kvm_vcpu *vcpu) | ||
| 5458 | { | ||
| 5459 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
| 5460 | return 1; | ||
| 5461 | } | ||
| 5462 | |||
| 5463 | static int handle_encls(struct kvm_vcpu *vcpu) | ||
| 5464 | { | ||
| 5465 | /* | ||
| 5466 | * SGX virtualization is not yet supported. There is no software | ||
| 5467 | * enable bit for SGX, so we have to trap ENCLS and inject a #UD | ||
| 5468 | * to prevent the guest from executing ENCLS. | ||
| 5469 | */ | ||
| 5470 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
| 5471 | return 1; | ||
| 5472 | } | ||
| 5473 | |||
| 5474 | /* | ||
| 5475 | * The exit handlers return 1 if the exit was handled fully and guest execution | ||
| 5476 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs | ||
| 5477 | * to be done to userspace and return 0. | ||
| 5478 | */ | ||
| 5479 | static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { | ||
| 5480 | [EXIT_REASON_EXCEPTION_NMI] = handle_exception, | ||
| 5481 | [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, | ||
| 5482 | [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, | ||
| 5483 | [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, | ||
| 5484 | [EXIT_REASON_IO_INSTRUCTION] = handle_io, | ||
| 5485 | [EXIT_REASON_CR_ACCESS] = handle_cr, | ||
| 5486 | [EXIT_REASON_DR_ACCESS] = handle_dr, | ||
| 5487 | [EXIT_REASON_CPUID] = handle_cpuid, | ||
| 5488 | [EXIT_REASON_MSR_READ] = handle_rdmsr, | ||
| 5489 | [EXIT_REASON_MSR_WRITE] = handle_wrmsr, | ||
| 5490 | [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, | ||
| 5491 | [EXIT_REASON_HLT] = handle_halt, | ||
| 5492 | [EXIT_REASON_INVD] = handle_invd, | ||
| 5493 | [EXIT_REASON_INVLPG] = handle_invlpg, | ||
| 5494 | [EXIT_REASON_RDPMC] = handle_rdpmc, | ||
| 5495 | [EXIT_REASON_VMCALL] = handle_vmcall, | ||
| 5496 | [EXIT_REASON_VMCLEAR] = handle_vmx_instruction, | ||
| 5497 | [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction, | ||
| 5498 | [EXIT_REASON_VMPTRLD] = handle_vmx_instruction, | ||
| 5499 | [EXIT_REASON_VMPTRST] = handle_vmx_instruction, | ||
| 5500 | [EXIT_REASON_VMREAD] = handle_vmx_instruction, | ||
| 5501 | [EXIT_REASON_VMRESUME] = handle_vmx_instruction, | ||
| 5502 | [EXIT_REASON_VMWRITE] = handle_vmx_instruction, | ||
| 5503 | [EXIT_REASON_VMOFF] = handle_vmx_instruction, | ||
| 5504 | [EXIT_REASON_VMON] = handle_vmx_instruction, | ||
| 5505 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, | ||
| 5506 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, | ||
| 5507 | [EXIT_REASON_APIC_WRITE] = handle_apic_write, | ||
| 5508 | [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, | ||
| 5509 | [EXIT_REASON_WBINVD] = handle_wbinvd, | ||
| 5510 | [EXIT_REASON_XSETBV] = handle_xsetbv, | ||
| 5511 | [EXIT_REASON_TASK_SWITCH] = handle_task_switch, | ||
| 5512 | [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, | ||
| 5513 | [EXIT_REASON_GDTR_IDTR] = handle_desc, | ||
| 5514 | [EXIT_REASON_LDTR_TR] = handle_desc, | ||
| 5515 | [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, | ||
| 5516 | [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, | ||
| 5517 | [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, | ||
| 5518 | [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, | ||
| 5519 | [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, | ||
| 5520 | [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, | ||
| 5521 | [EXIT_REASON_INVEPT] = handle_vmx_instruction, | ||
| 5522 | [EXIT_REASON_INVVPID] = handle_vmx_instruction, | ||
| 5523 | [EXIT_REASON_RDRAND] = handle_invalid_op, | ||
| 5524 | [EXIT_REASON_RDSEED] = handle_invalid_op, | ||
| 5525 | [EXIT_REASON_XSAVES] = handle_xsaves, | ||
| 5526 | [EXIT_REASON_XRSTORS] = handle_xrstors, | ||
| 5527 | [EXIT_REASON_PML_FULL] = handle_pml_full, | ||
| 5528 | [EXIT_REASON_INVPCID] = handle_invpcid, | ||
| 5529 | [EXIT_REASON_VMFUNC] = handle_vmx_instruction, | ||
| 5530 | [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, | ||
| 5531 | [EXIT_REASON_ENCLS] = handle_encls, | ||
| 5532 | }; | ||
| 5533 | |||
| 5534 | static const int kvm_vmx_max_exit_handlers = | ||
| 5535 | ARRAY_SIZE(kvm_vmx_exit_handlers); | ||
| 5536 | |||
| 5537 | static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) | ||
| 5538 | { | ||
| 5539 | *info1 = vmcs_readl(EXIT_QUALIFICATION); | ||
| 5540 | *info2 = vmcs_read32(VM_EXIT_INTR_INFO); | ||
| 5541 | } | ||
| 5542 | |||
| 5543 | static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) | ||
| 5544 | { | ||
| 5545 | if (vmx->pml_pg) { | ||
| 5546 | __free_page(vmx->pml_pg); | ||
| 5547 | vmx->pml_pg = NULL; | ||
| 5548 | } | ||
| 5549 | } | ||
| 5550 | |||
| 5551 | static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) | ||
| 5552 | { | ||
| 5553 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 5554 | u64 *pml_buf; | ||
| 5555 | u16 pml_idx; | ||
| 5556 | |||
| 5557 | pml_idx = vmcs_read16(GUEST_PML_INDEX); | ||
| 5558 | |||
| 5559 | /* Do nothing if PML buffer is empty */ | ||
| 5560 | if (pml_idx == (PML_ENTITY_NUM - 1)) | ||
| 5561 | return; | ||
| 5562 | |||
| 5563 | /* PML index always points to next available PML buffer entity */ | ||
| 5564 | if (pml_idx >= PML_ENTITY_NUM) | ||
| 5565 | pml_idx = 0; | ||
| 5566 | else | ||
| 5567 | pml_idx++; | ||
| 5568 | |||
| 5569 | pml_buf = page_address(vmx->pml_pg); | ||
| 5570 | for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { | ||
| 5571 | u64 gpa; | ||
| 5572 | |||
| 5573 | gpa = pml_buf[pml_idx]; | ||
| 5574 | WARN_ON(gpa & (PAGE_SIZE - 1)); | ||
| 5575 | kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); | ||
| 5576 | } | ||
| 5577 | |||
| 5578 | /* reset PML index */ | ||
| 5579 | vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); | ||
| 5580 | } | ||
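
The PML hardware writes guest-physical addresses into a one-page buffer and decrements GUEST_PML_INDEX from PML_ENTITY_NUM - 1, so the drain loop above walks from the first used slot to the end of the page. Below is a self-contained sketch of that walk; the constants and the mark_dirty() callback are illustrative stand-ins for the kernel values and for kvm_vcpu_mark_page_dirty().

#include <stdint.h>
#include <stdio.h>

#define PML_ENTITY_NUM	512	/* one 4 KiB page of 8-byte GPA entries */
#define PAGE_SHIFT	12

static void mark_dirty(uint64_t gfn)
{
	printf("dirty gfn %#llx\n", (unsigned long long)gfn);
}

static void flush_pml(const uint64_t *pml_buf, uint16_t pml_idx)
{
	if (pml_idx == PML_ENTITY_NUM - 1)	/* buffer is empty */
		return;

	/* The index points at the next free slot; step to the first used one. */
	if (pml_idx >= PML_ENTITY_NUM)
		pml_idx = 0;
	else
		pml_idx++;

	for (; pml_idx < PML_ENTITY_NUM; pml_idx++)
		mark_dirty(pml_buf[pml_idx] >> PAGE_SHIFT);
}

int main(void)
{
	uint64_t buf[PML_ENTITY_NUM] = { 0 };

	buf[510] = 0x1000;	/* two entries were logged ... */
	buf[511] = 0x2000;
	flush_pml(buf, 509);	/* ... so hardware decremented the index twice */
	return 0;
}
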
| 5581 | |||
| 5582 | /* | ||
| 5583 | * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap. | ||
| 5584 | * Called before reporting dirty_bitmap to userspace. | ||
| 5585 | */ | ||
| 5586 | static void kvm_flush_pml_buffers(struct kvm *kvm) | ||
| 5587 | { | ||
| 5588 | int i; | ||
| 5589 | struct kvm_vcpu *vcpu; | ||
| 5590 | /* | ||
| 5591 | * We only need to kick each vcpu out of guest mode here; the PML | ||
| 5592 | * buffer is flushed at the beginning of every VMEXIT, so only vcpus | ||
| 5593 | * currently running in guest mode can have unflushed GPAs in their | ||
| 5594 | * PML buffer. | ||
| 5595 | */ | ||
| 5596 | kvm_for_each_vcpu(i, vcpu, kvm) | ||
| 5597 | kvm_vcpu_kick(vcpu); | ||
| 5598 | } | ||
| 5599 | |||
| 5600 | static void vmx_dump_sel(char *name, uint32_t sel) | ||
| 5601 | { | ||
| 5602 | pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", | ||
| 5603 | name, vmcs_read16(sel), | ||
| 5604 | vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), | ||
| 5605 | vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), | ||
| 5606 | vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); | ||
| 5607 | } | ||
| 5608 | |||
| 5609 | static void vmx_dump_dtsel(char *name, uint32_t limit) | ||
| 5610 | { | ||
| 5611 | pr_err("%s limit=0x%08x, base=0x%016lx\n", | ||
| 5612 | name, vmcs_read32(limit), | ||
| 5613 | vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); | ||
| 5614 | } | ||
| 5615 | |||
| 5616 | static void dump_vmcs(void) | ||
| 5617 | { | ||
| 5618 | u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); | ||
| 5619 | u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); | ||
| 5620 | u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | ||
| 5621 | u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); | ||
| 5622 | u32 secondary_exec_control = 0; | ||
| 5623 | unsigned long cr4 = vmcs_readl(GUEST_CR4); | ||
| 5624 | u64 efer = vmcs_read64(GUEST_IA32_EFER); | ||
| 5625 | int i, n; | ||
| 5626 | |||
| 5627 | if (cpu_has_secondary_exec_ctrls()) | ||
| 5628 | secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); | ||
| 5629 | |||
| 5630 | pr_err("*** Guest State ***\n"); | ||
| 5631 | pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", | ||
| 5632 | vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), | ||
| 5633 | vmcs_readl(CR0_GUEST_HOST_MASK)); | ||
| 5634 | pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", | ||
| 5635 | cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); | ||
| 5636 | pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); | ||
| 5637 | if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) && | ||
| 5638 | (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA)) | ||
| 5639 | { | ||
| 5640 | pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", | ||
| 5641 | vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); | ||
| 5642 | pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n", | ||
| 5643 | vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3)); | ||
| 5644 | } | ||
| 5645 | pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", | ||
| 5646 | vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); | ||
| 5647 | pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", | ||
| 5648 | vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); | ||
| 5649 | pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", | ||
| 5650 | vmcs_readl(GUEST_SYSENTER_ESP), | ||
| 5651 | vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); | ||
| 5652 | vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); | ||
| 5653 | vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); | ||
| 5654 | vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); | ||
| 5655 | vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); | ||
| 5656 | vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); | ||
| 5657 | vmx_dump_sel("GS: ", GUEST_GS_SELECTOR); | ||
| 5658 | vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); | ||
| 5659 | vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); | ||
| 5660 | vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); | ||
| 5661 | vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); | ||
| 5662 | if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) || | ||
| 5663 | (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER))) | ||
| 5664 | pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", | ||
| 5665 | efer, vmcs_read64(GUEST_IA32_PAT)); | ||
| 5666 | pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", | ||
| 5667 | vmcs_read64(GUEST_IA32_DEBUGCTL), | ||
| 5668 | vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); | ||
| 5669 | if (cpu_has_load_perf_global_ctrl() && | ||
| 5670 | vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) | ||
| 5671 | pr_err("PerfGlobCtl = 0x%016llx\n", | ||
| 5672 | vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); | ||
| 5673 | if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) | ||
| 5674 | pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS)); | ||
| 5675 | pr_err("Interruptibility = %08x ActivityState = %08x\n", | ||
| 5676 | vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), | ||
| 5677 | vmcs_read32(GUEST_ACTIVITY_STATE)); | ||
| 5678 | if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) | ||
| 5679 | pr_err("InterruptStatus = %04x\n", | ||
| 5680 | vmcs_read16(GUEST_INTR_STATUS)); | ||
| 5681 | |||
| 5682 | pr_err("*** Host State ***\n"); | ||
| 5683 | pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", | ||
| 5684 | vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); | ||
| 5685 | pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", | ||
| 5686 | vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), | ||
| 5687 | vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), | ||
| 5688 | vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), | ||
| 5689 | vmcs_read16(HOST_TR_SELECTOR)); | ||
| 5690 | pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", | ||
| 5691 | vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), | ||
| 5692 | vmcs_readl(HOST_TR_BASE)); | ||
| 5693 | pr_err("GDTBase=%016lx IDTBase=%016lx\n", | ||
| 5694 | vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); | ||
| 5695 | pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n", | ||
| 5696 | vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), | ||
| 5697 | vmcs_readl(HOST_CR4)); | ||
| 5698 | pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", | ||
| 5699 | vmcs_readl(HOST_IA32_SYSENTER_ESP), | ||
| 5700 | vmcs_read32(HOST_IA32_SYSENTER_CS), | ||
| 5701 | vmcs_readl(HOST_IA32_SYSENTER_EIP)); | ||
| 5702 | if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER)) | ||
| 5703 | pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", | ||
| 5704 | vmcs_read64(HOST_IA32_EFER), | ||
| 5705 | vmcs_read64(HOST_IA32_PAT)); | ||
| 5706 | if (cpu_has_load_perf_global_ctrl() && | ||
| 5707 | vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) | ||
| 5708 | pr_err("PerfGlobCtl = 0x%016llx\n", | ||
| 5709 | vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); | ||
| 5710 | |||
| 5711 | pr_err("*** Control State ***\n"); | ||
| 5712 | pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", | ||
| 5713 | pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control); | ||
| 5714 | pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); | ||
| 5715 | pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", | ||
| 5716 | vmcs_read32(EXCEPTION_BITMAP), | ||
| 5717 | vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), | ||
| 5718 | vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); | ||
| 5719 | pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", | ||
| 5720 | vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), | ||
| 5721 | vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), | ||
| 5722 | vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); | ||
| 5723 | pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", | ||
| 5724 | vmcs_read32(VM_EXIT_INTR_INFO), | ||
| 5725 | vmcs_read32(VM_EXIT_INTR_ERROR_CODE), | ||
| 5726 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); | ||
| 5727 | pr_err(" reason=%08x qualification=%016lx\n", | ||
| 5728 | vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); | ||
| 5729 | pr_err("IDTVectoring: info=%08x errcode=%08x\n", | ||
| 5730 | vmcs_read32(IDT_VECTORING_INFO_FIELD), | ||
| 5731 | vmcs_read32(IDT_VECTORING_ERROR_CODE)); | ||
| 5732 | pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); | ||
| 5733 | if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) | ||
| 5734 | pr_err("TSC Multiplier = 0x%016llx\n", | ||
| 5735 | vmcs_read64(TSC_MULTIPLIER)); | ||
| 5736 | if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) | ||
| 5737 | pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); | ||
| 5738 | if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) | ||
| 5739 | pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); | ||
| 5740 | if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) | ||
| 5741 | pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); | ||
| 5742 | n = vmcs_read32(CR3_TARGET_COUNT); | ||
| 5743 | for (i = 0; i + 1 < n; i += 4) | ||
| 5744 | pr_err("CR3 target%u=%016lx target%u=%016lx\n", | ||
| 5745 | i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2), | ||
| 5746 | i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2)); | ||
| 5747 | if (i < n) | ||
| 5748 | pr_err("CR3 target%u=%016lx\n", | ||
| 5749 | i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2)); | ||
| 5750 | if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) | ||
| 5751 | pr_err("PLE Gap=%08x Window=%08x\n", | ||
| 5752 | vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); | ||
| 5753 | if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) | ||
| 5754 | pr_err("Virtual processor ID = 0x%04x\n", | ||
| 5755 | vmcs_read16(VIRTUAL_PROCESSOR_ID)); | ||
| 5756 | } | ||
| 5757 | |||
| 5758 | /* | ||
| 5759 | * The guest has exited. See if we can fix it or if we need userspace | ||
| 5760 | * assistance. | ||
| 5761 | */ | ||
| 5762 | static int vmx_handle_exit(struct kvm_vcpu *vcpu) | ||
| 5763 | { | ||
| 5764 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 5765 | u32 exit_reason = vmx->exit_reason; | ||
| 5766 | u32 vectoring_info = vmx->idt_vectoring_info; | ||
| 5767 | |||
| 5768 | trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); | ||
| 5769 | |||
| 5770 | /* | ||
| 5771 | * Flush the PML buffer of logged GPAs; this keeps dirty_bitmap up to | ||
| 5772 | * date. It also means that kvm_vm_ioctl_get_dirty_log only needs to | ||
| 5773 | * kick all vcpus out of guest mode before querying dirty_bitmap: once | ||
| 5774 | * a vcpu is in root mode, its PML buffer must already have been | ||
| 5775 | * flushed. | ||
| 5776 | */ | ||
| 5777 | if (enable_pml) | ||
| 5778 | vmx_flush_pml_buffer(vcpu); | ||
| 5779 | |||
| 5780 | /* If guest state is invalid, start emulating */ | ||
| 5781 | if (vmx->emulation_required) | ||
| 5782 | return handle_invalid_guest_state(vcpu); | ||
| 5783 | |||
| 5784 | if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason)) | ||
| 5785 | return nested_vmx_reflect_vmexit(vcpu, exit_reason); | ||
| 5786 | |||
| 5787 | if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { | ||
| 5788 | dump_vmcs(); | ||
| 5789 | vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; | ||
| 5790 | vcpu->run->fail_entry.hardware_entry_failure_reason | ||
| 5791 | = exit_reason; | ||
| 5792 | return 0; | ||
| 5793 | } | ||
| 5794 | |||
| 5795 | if (unlikely(vmx->fail)) { | ||
| 5796 | vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; | ||
| 5797 | vcpu->run->fail_entry.hardware_entry_failure_reason | ||
| 5798 | = vmcs_read32(VM_INSTRUCTION_ERROR); | ||
| 5799 | return 0; | ||
| 5800 | } | ||
| 5801 | |||
| 5802 | /* | ||
| 5803 | * Note: | ||
| 5804 | * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by an | ||
| 5805 | * event delivery, since that indicates the guest is accessing MMIO. | ||
| 5806 | * The VM exit would be triggered again after returning to the guest, | ||
| 5807 | * causing an infinite loop. | ||
| 5808 | */ | ||
| 5809 | if ((vectoring_info & VECTORING_INFO_VALID_MASK) && | ||
| 5810 | (exit_reason != EXIT_REASON_EXCEPTION_NMI && | ||
| 5811 | exit_reason != EXIT_REASON_EPT_VIOLATION && | ||
| 5812 | exit_reason != EXIT_REASON_PML_FULL && | ||
| 5813 | exit_reason != EXIT_REASON_TASK_SWITCH)) { | ||
| 5814 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; | ||
| 5815 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; | ||
| 5816 | vcpu->run->internal.ndata = 3; | ||
| 5817 | vcpu->run->internal.data[0] = vectoring_info; | ||
| 5818 | vcpu->run->internal.data[1] = exit_reason; | ||
| 5819 | vcpu->run->internal.data[2] = vcpu->arch.exit_qualification; | ||
| 5820 | if (exit_reason == EXIT_REASON_EPT_MISCONFIG) { | ||
| 5821 | vcpu->run->internal.ndata++; | ||
| 5822 | vcpu->run->internal.data[3] = | ||
| 5823 | vmcs_read64(GUEST_PHYSICAL_ADDRESS); | ||
| 5824 | } | ||
| 5825 | return 0; | ||
| 5826 | } | ||
| 5827 | |||
| 5828 | if (unlikely(!enable_vnmi && | ||
| 5829 | vmx->loaded_vmcs->soft_vnmi_blocked)) { | ||
| 5830 | if (vmx_interrupt_allowed(vcpu)) { | ||
| 5831 | vmx->loaded_vmcs->soft_vnmi_blocked = 0; | ||
| 5832 | } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL && | ||
| 5833 | vcpu->arch.nmi_pending) { | ||
| 5834 | /* | ||
| 5835 | * This CPU doesn't help us find the end of an | ||
| 5836 | * NMI-blocked window if the guest runs with IRQs | ||
| 5837 | * disabled. So we pull the trigger after 1 s of | ||
| 5838 | * futile waiting, and inform the user about it. | ||
| 5839 | */ | ||
| 5840 | printk(KERN_WARNING "%s: Breaking out of NMI-blocked " | ||
| 5841 | "state on VCPU %d after 1 s timeout\n", | ||
| 5842 | __func__, vcpu->vcpu_id); | ||
| 5843 | vmx->loaded_vmcs->soft_vnmi_blocked = 0; | ||
| 5844 | } | ||
| 5845 | } | ||
| 5846 | |||
| 5847 | if (exit_reason < kvm_vmx_max_exit_handlers | ||
| 5848 | && kvm_vmx_exit_handlers[exit_reason]) | ||
| 5849 | return kvm_vmx_exit_handlers[exit_reason](vcpu); | ||
| 5850 | else { | ||
| 5851 | vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", | ||
| 5852 | exit_reason); | ||
| 5853 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
| 5854 | return 1; | ||
| 5855 | } | ||
| 5856 | } | ||
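
The tail of vmx_handle_exit() is a bounds-checked dispatch through kvm_vmx_exit_handlers[], falling back to a warning plus #UD injection for unknown exit reasons. A minimal sketch of that pattern follows; the handler names, the two table entries and the example reason numbers are illustrative only.

#include <stdio.h>
#include <stddef.h>

typedef int (*exit_handler_t)(void *vcpu);

static int handle_hlt(void *vcpu) { (void)vcpu; puts("hlt"); return 1; }
static int handle_io(void *vcpu)  { (void)vcpu; puts("io");  return 0; }

static exit_handler_t handlers[] = {
	[12] = handle_hlt,	/* e.g. EXIT_REASON_HLT */
	[30] = handle_io,	/* e.g. EXIT_REASON_IO_INSTRUCTION */
};

static int dispatch(unsigned int exit_reason, void *vcpu)
{
	if (exit_reason < sizeof(handlers) / sizeof(handlers[0]) &&
	    handlers[exit_reason])
		return handlers[exit_reason](vcpu);

	fprintf(stderr, "unexpected exit reason %#x\n", exit_reason);
	return 1;	/* e.g. inject #UD and resume, as above */
}

int main(void)
{
	dispatch(12, NULL);	/* handled in the table */
	dispatch(55, NULL);	/* falls back to the default path */
	return 0;
}
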
| 5857 | |||
| 5858 | /* | ||
| 5859 | * Software-based L1D cache flush, used when the microcode providing | ||
| 5860 | * the cache-control MSR is not loaded. | ||
| 5861 | * | ||
| 5862 | * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but | ||
| 5863 | * flushing it requires reading 64 KiB because the replacement algorithm | ||
| 5864 | * is not exactly LRU. This could be sized at runtime via topology | ||
| 5865 | * information, but as all relevant affected CPUs have a 32 KiB L1D cache | ||
| 5866 | * there is no point in doing so. | ||
| 5867 | */ | ||
| 5868 | static void vmx_l1d_flush(struct kvm_vcpu *vcpu) | ||
| 5869 | { | ||
| 5870 | int size = PAGE_SIZE << L1D_CACHE_ORDER; | ||
| 5871 | |||
| 5872 | /* | ||
| 5873 | * This code is only executed when the flush mode is 'cond' or | ||
| 5874 | * 'always'. | ||
| 5875 | */ | ||
| 5876 | if (static_branch_likely(&vmx_l1d_flush_cond)) { | ||
| 5877 | bool flush_l1d; | ||
| 5878 | |||
| 5879 | /* | ||
| 5880 | * Clear the per-vcpu flush bit; it gets set again | ||
| 5881 | * either from vcpu_run() or from one of the unsafe | ||
| 5882 | * VMEXIT handlers. | ||
| 5883 | */ | ||
| 5884 | flush_l1d = vcpu->arch.l1tf_flush_l1d; | ||
| 5885 | vcpu->arch.l1tf_flush_l1d = false; | ||
| 5886 | |||
| 5887 | /* | ||
| 5888 | * Clear the per-cpu flush bit; it gets set again from | ||
| 5889 | * the interrupt handlers. | ||
| 5890 | */ | ||
| 5891 | flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); | ||
| 5892 | kvm_clear_cpu_l1tf_flush_l1d(); | ||
| 5893 | |||
| 5894 | if (!flush_l1d) | ||
| 5895 | return; | ||
| 5896 | } | ||
| 5897 | |||
| 5898 | vcpu->stat.l1d_flush++; | ||
| 5899 | |||
| 5900 | if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { | ||
| 5901 | wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); | ||
| 5902 | return; | ||
| 5903 | } | ||
| 5904 | |||
| 5905 | asm volatile( | ||
| 5906 | /* First ensure the pages are in the TLB */ | ||
| 5907 | "xorl %%eax, %%eax\n" | ||
| 5908 | ".Lpopulate_tlb:\n\t" | ||
| 5909 | "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" | ||
| 5910 | "addl $4096, %%eax\n\t" | ||
| 5911 | "cmpl %%eax, %[size]\n\t" | ||
| 5912 | "jne .Lpopulate_tlb\n\t" | ||
| 5913 | "xorl %%eax, %%eax\n\t" | ||
| 5914 | "cpuid\n\t" | ||
| 5915 | /* Now fill the cache */ | ||
| 5916 | "xorl %%eax, %%eax\n" | ||
| 5917 | ".Lfill_cache:\n" | ||
| 5918 | "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" | ||
| 5919 | "addl $64, %%eax\n\t" | ||
| 5920 | "cmpl %%eax, %[size]\n\t" | ||
| 5921 | "jne .Lfill_cache\n\t" | ||
| 5922 | "lfence\n" | ||
| 5923 | :: [flush_pages] "r" (vmx_l1d_flush_pages), | ||
| 5924 | [size] "r" (size) | ||
| 5925 | : "eax", "ebx", "ecx", "edx"); | ||
| 5926 | } | ||
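
For reference, the asm above is a two-pass loop: a 4 KiB-stride pass that pulls the buffer's pages into the TLB, then a 64-byte-stride pass that touches every cache line of the 64 KiB buffer to displace the old L1D contents. A C rendition of the same access pattern is sketched below (the kernel keeps this in asm so the exact pattern survives the compiler); the buffer and line sizes follow the comment above.

#include <stdlib.h>
#include <stdint.h>

#define FLUSH_SIZE	(64 * 1024)
#define CACHE_LINE	64

static void software_l1d_fill(volatile uint8_t *buf)
{
	volatile uint8_t sink;
	size_t i;

	/* First pass with a 4 KiB stride pulls the pages into the TLB. */
	for (i = 0; i < FLUSH_SIZE; i += 4096)
		sink = buf[i];
	/* Second pass touches every cache line to displace the old contents. */
	for (i = 0; i < FLUSH_SIZE; i += CACHE_LINE)
		sink = buf[i];
	(void)sink;
}

int main(void)
{
	uint8_t *buf = malloc(FLUSH_SIZE);

	if (buf)
		software_l1d_fill(buf);
	free(buf);
	return 0;
}
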
| 5927 | |||
| 5928 | static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) | ||
| 5929 | { | ||
| 5930 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 5931 | |||
| 5932 | if (is_guest_mode(vcpu) && | ||
| 5933 | nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) | ||
| 5934 | return; | ||
| 5935 | |||
| 5936 | if (irr == -1 || tpr < irr) { | ||
| 5937 | vmcs_write32(TPR_THRESHOLD, 0); | ||
| 5938 | return; | ||
| 5939 | } | ||
| 5940 | |||
| 5941 | vmcs_write32(TPR_THRESHOLD, irr); | ||
| 5942 | } | ||
| 5943 | |||
| 5944 | void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) | ||
| 5945 | { | ||
| 5946 | u32 sec_exec_control; | ||
| 5947 | |||
| 5948 | if (!lapic_in_kernel(vcpu)) | ||
| 5949 | return; | ||
| 5950 | |||
| 5951 | if (!flexpriority_enabled && | ||
| 5952 | !cpu_has_vmx_virtualize_x2apic_mode()) | ||
| 5953 | return; | ||
| 5954 | |||
| 5955 | /* Postpone execution until vmcs01 is the current VMCS. */ | ||
| 5956 | if (is_guest_mode(vcpu)) { | ||
| 5957 | to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true; | ||
| 5958 | return; | ||
| 5959 | } | ||
| 5960 | |||
| 5961 | sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); | ||
| 5962 | sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | | ||
| 5963 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); | ||
| 5964 | |||
| 5965 | switch (kvm_get_apic_mode(vcpu)) { | ||
| 5966 | case LAPIC_MODE_INVALID: | ||
| 5967 | WARN_ONCE(true, "Invalid local APIC state"); | ||
| 5968 | case LAPIC_MODE_DISABLED: | ||
| 5969 | break; | ||
| 5970 | case LAPIC_MODE_XAPIC: | ||
| 5971 | if (flexpriority_enabled) { | ||
| 5972 | sec_exec_control |= | ||
| 5973 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
| 5974 | vmx_flush_tlb(vcpu, true); | ||
| 5975 | } | ||
| 5976 | break; | ||
| 5977 | case LAPIC_MODE_X2APIC: | ||
| 5978 | if (cpu_has_vmx_virtualize_x2apic_mode()) | ||
| 5979 | sec_exec_control |= | ||
| 5980 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; | ||
| 5981 | break; | ||
| 5982 | } | ||
| 5983 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); | ||
| 5984 | |||
| 5985 | vmx_update_msr_bitmap(vcpu); | ||
| 5986 | } | ||
| 5987 | |||
| 5988 | static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) | ||
| 5989 | { | ||
| 5990 | if (!is_guest_mode(vcpu)) { | ||
| 5991 | vmcs_write64(APIC_ACCESS_ADDR, hpa); | ||
| 5992 | vmx_flush_tlb(vcpu, true); | ||
| 5993 | } | ||
| 5994 | } | ||
| 5995 | |||
| 5996 | static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) | ||
| 5997 | { | ||
| 5998 | u16 status; | ||
| 5999 | u8 old; | ||
| 6000 | |||
| 6001 | if (max_isr == -1) | ||
| 6002 | max_isr = 0; | ||
| 6003 | |||
| 6004 | status = vmcs_read16(GUEST_INTR_STATUS); | ||
| 6005 | old = status >> 8; | ||
| 6006 | if (max_isr != old) { | ||
| 6007 | status &= 0xff; | ||
| 6008 | status |= max_isr << 8; | ||
| 6009 | vmcs_write16(GUEST_INTR_STATUS, status); | ||
| 6010 | } | ||
| 6011 | } | ||
| 6012 | |||
| 6013 | static void vmx_set_rvi(int vector) | ||
| 6014 | { | ||
| 6015 | u16 status; | ||
| 6016 | u8 old; | ||
| 6017 | |||
| 6018 | if (vector == -1) | ||
| 6019 | vector = 0; | ||
| 6020 | |||
| 6021 | status = vmcs_read16(GUEST_INTR_STATUS); | ||
| 6022 | old = (u8)status & 0xff; | ||
| 6023 | if ((u8)vector != old) { | ||
| 6024 | status &= ~0xff; | ||
| 6025 | status |= (u8)vector; | ||
| 6026 | vmcs_write16(GUEST_INTR_STATUS, status); | ||
| 6027 | } | ||
| 6028 | } | ||
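
vmx_hwapic_isr_update() and vmx_set_rvi() each rewrite one byte of the 16-bit guest interrupt status field: SVI lives in the high byte, RVI in the low byte. A standalone sketch of that packing, with hypothetical helper names:

#include <stdint.h>
#include <stdio.h>

static uint16_t set_rvi(uint16_t status, int vector)
{
	if (vector == -1)
		vector = 0;
	status &= ~0xff;			/* keep SVI, replace RVI */
	status |= (uint8_t)vector;
	return status;
}

static uint16_t set_svi(uint16_t status, int max_isr)
{
	if (max_isr == -1)
		max_isr = 0;
	status &= 0xff;				/* keep RVI, replace SVI */
	status |= (uint16_t)((uint8_t)max_isr) << 8;
	return status;
}

int main(void)
{
	uint16_t status = 0;

	status = set_rvi(status, 0x41);
	status = set_svi(status, 0x30);
	printf("GUEST_INTR_STATUS = %#06x\n", status);	/* prints 0x3041 */
	return 0;
}
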
| 6029 | |||
| 6030 | static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) | ||
| 6031 | { | ||
| 6032 | /* | ||
| 6033 | * When running L2, updating RVI is only relevant when | ||
| 6034 | * vmcs12 virtual-interrupt-delivery is enabled. | ||
| 6035 | * However, it can be enabled only when L1 also | ||
| 6036 | * intercepts external interrupts, and in that case | ||
| 6037 | * we should not update vmcs02's RVI but instead intercept | ||
| 6038 | * the interrupt. Therefore, do nothing when running L2. | ||
| 6039 | */ | ||
| 6040 | if (!is_guest_mode(vcpu)) | ||
| 6041 | vmx_set_rvi(max_irr); | ||
| 6042 | } | ||
| 6043 | |||
| 6044 | static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) | ||
| 6045 | { | ||
| 6046 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 6047 | int max_irr; | ||
| 6048 | bool max_irr_updated; | ||
| 6049 | |||
| 6050 | WARN_ON(!vcpu->arch.apicv_active); | ||
| 6051 | if (pi_test_on(&vmx->pi_desc)) { | ||
| 6052 | pi_clear_on(&vmx->pi_desc); | ||
| 6053 | /* | ||
| 6054 | * IOMMU can write to PIR.ON, so the barrier matters even on UP. | ||
| 6055 | * But on x86 this is just a compiler barrier anyway. | ||
| 6056 | */ | ||
| 6057 | smp_mb__after_atomic(); | ||
| 6058 | max_irr_updated = | ||
| 6059 | kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); | ||
| 6060 | |||
| 6061 | /* | ||
| 6062 | * If we are running L2 and L1 has a new pending interrupt | ||
| 6063 | * which can be injected, we should re-evaluate | ||
| 6064 | * what should be done with this new L1 interrupt. | ||
| 6065 | * If L1 intercepts external-interrupts, we should | ||
| 6066 | * exit from L2 to L1. Otherwise, interrupt should be | ||
| 6067 | * delivered directly to L2. | ||
| 6068 | */ | ||
| 6069 | if (is_guest_mode(vcpu) && max_irr_updated) { | ||
| 6070 | if (nested_exit_on_intr(vcpu)) | ||
| 6071 | kvm_vcpu_exiting_guest_mode(vcpu); | ||
| 6072 | else | ||
| 6073 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
| 6074 | } | ||
| 6075 | } else { | ||
| 6076 | max_irr = kvm_lapic_find_highest_irr(vcpu); | ||
| 6077 | } | ||
| 6078 | vmx_hwapic_irr_update(vcpu, max_irr); | ||
| 6079 | return max_irr; | ||
| 6080 | } | ||
| 6081 | |||
| 6082 | static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) | ||
| 6083 | { | ||
| 6084 | if (!kvm_vcpu_apicv_active(vcpu)) | ||
| 6085 | return; | ||
| 6086 | |||
| 6087 | vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); | ||
| 6088 | vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); | ||
| 6089 | vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); | ||
| 6090 | vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); | ||
| 6091 | } | ||
| 6092 | |||
| 6093 | static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) | ||
| 6094 | { | ||
| 6095 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 6096 | |||
| 6097 | pi_clear_on(&vmx->pi_desc); | ||
| 6098 | memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); | ||
| 6099 | } | ||
| 6100 | |||
| 6101 | static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) | ||
| 6102 | { | ||
| 6103 | u32 exit_intr_info = 0; | ||
| 6104 | u16 basic_exit_reason = (u16)vmx->exit_reason; | ||
| 6105 | |||
| 6106 | if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY | ||
| 6107 | || basic_exit_reason == EXIT_REASON_EXCEPTION_NMI)) | ||
| 6108 | return; | ||
| 6109 | |||
| 6110 | if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) | ||
| 6111 | exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | ||
| 6112 | vmx->exit_intr_info = exit_intr_info; | ||
| 6113 | |||
| 6114 | /* If the exit was due to a page fault, check for an async page fault. */ | ||
| 6115 | if (is_page_fault(exit_intr_info)) | ||
| 6116 | vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); | ||
| 6117 | |||
| 6118 | /* Handle machine checks before interrupts are enabled */ | ||
| 6119 | if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY || | ||
| 6120 | is_machine_check(exit_intr_info)) | ||
| 6121 | kvm_machine_check(); | ||
| 6122 | |||
| 6123 | /* We need to handle NMIs before interrupts are enabled */ | ||
| 6124 | if (is_nmi(exit_intr_info)) { | ||
| 6125 | kvm_before_interrupt(&vmx->vcpu); | ||
| 6126 | asm("int $2"); | ||
| 6127 | kvm_after_interrupt(&vmx->vcpu); | ||
| 6128 | } | ||
| 6129 | } | ||
| 6130 | |||
| 6131 | static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) | ||
| 6132 | { | ||
| 6133 | u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | ||
| 6134 | |||
| 6135 | if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) | ||
| 6136 | == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { | ||
| 6137 | unsigned int vector; | ||
| 6138 | unsigned long entry; | ||
| 6139 | gate_desc *desc; | ||
| 6140 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 6141 | #ifdef CONFIG_X86_64 | ||
| 6142 | unsigned long tmp; | ||
| 6143 | #endif | ||
| 6144 | |||
| 6145 | vector = exit_intr_info & INTR_INFO_VECTOR_MASK; | ||
| 6146 | desc = (gate_desc *)vmx->host_idt_base + vector; | ||
| 6147 | entry = gate_offset(desc); | ||
| 6148 | asm volatile( | ||
| 6149 | #ifdef CONFIG_X86_64 | ||
| 6150 | "mov %%" _ASM_SP ", %[sp]\n\t" | ||
| 6151 | "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t" | ||
| 6152 | "push $%c[ss]\n\t" | ||
| 6153 | "push %[sp]\n\t" | ||
| 6154 | #endif | ||
| 6155 | "pushf\n\t" | ||
| 6156 | __ASM_SIZE(push) " $%c[cs]\n\t" | ||
| 6157 | CALL_NOSPEC | ||
| 6158 | : | ||
| 6159 | #ifdef CONFIG_X86_64 | ||
| 6160 | [sp]"=&r"(tmp), | ||
| 6161 | #endif | ||
| 6162 | ASM_CALL_CONSTRAINT | ||
| 6163 | : | ||
| 6164 | THUNK_TARGET(entry), | ||
| 6165 | [ss]"i"(__KERNEL_DS), | ||
| 6166 | [cs]"i"(__KERNEL_CS) | ||
| 6167 | ); | ||
| 6168 | } | ||
| 6169 | } | ||
| 6170 | STACK_FRAME_NON_STANDARD(vmx_handle_external_intr); | ||
| 6171 | |||
| 6172 | static bool vmx_has_emulated_msr(int index) | ||
| 6173 | { | ||
| 6174 | switch (index) { | ||
| 6175 | case MSR_IA32_SMBASE: | ||
| 6176 | /* | ||
| 6177 | * We cannot do SMM unless we can run the guest in big | ||
| 6178 | * real mode. | ||
| 6179 | */ | ||
| 6180 | return enable_unrestricted_guest || emulate_invalid_guest_state; | ||
| 6181 | case MSR_AMD64_VIRT_SPEC_CTRL: | ||
| 6182 | /* This is AMD only. */ | ||
| 6183 | return false; | ||
| 6184 | default: | ||
| 6185 | return true; | ||
| 6186 | } | ||
| 6187 | } | ||
| 6188 | |||
| 6189 | static bool vmx_pt_supported(void) | ||
| 6190 | { | ||
| 6191 | return pt_mode == PT_MODE_HOST_GUEST; | ||
| 6192 | } | ||
| 6193 | |||
| 6194 | static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) | ||
| 6195 | { | ||
| 6196 | u32 exit_intr_info; | ||
| 6197 | bool unblock_nmi; | ||
| 6198 | u8 vector; | ||
| 6199 | bool idtv_info_valid; | ||
| 6200 | |||
| 6201 | idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; | ||
| 6202 | |||
| 6203 | if (enable_vnmi) { | ||
| 6204 | if (vmx->loaded_vmcs->nmi_known_unmasked) | ||
| 6205 | return; | ||
| 6206 | /* | ||
| 6207 | * Can't use vmx->exit_intr_info since we're not sure what | ||
| 6208 | * the exit reason is. | ||
| 6209 | */ | ||
| 6210 | exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | ||
| 6211 | unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; | ||
| 6212 | vector = exit_intr_info & INTR_INFO_VECTOR_MASK; | ||
| 6213 | /* | ||
| 6214 | * SDM 3: 27.7.1.2 (September 2008) | ||
| 6215 | * Re-set bit "block by NMI" before VM entry if vmexit caused by | ||
| 6216 | * a guest IRET fault. | ||
| 6217 | * SDM 3: 23.2.2 (September 2008) | ||
| 6218 | * Bit 12 is undefined in any of the following cases: | ||
| 6219 | * If the VM exit sets the valid bit in the IDT-vectoring | ||
| 6220 | * information field. | ||
| 6221 | * If the VM exit is due to a double fault. | ||
| 6222 | */ | ||
| 6223 | if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && | ||
| 6224 | vector != DF_VECTOR && !idtv_info_valid) | ||
| 6225 | vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, | ||
| 6226 | GUEST_INTR_STATE_NMI); | ||
| 6227 | else | ||
| 6228 | vmx->loaded_vmcs->nmi_known_unmasked = | ||
| 6229 | !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) | ||
| 6230 | & GUEST_INTR_STATE_NMI); | ||
| 6231 | } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked)) | ||
| 6232 | vmx->loaded_vmcs->vnmi_blocked_time += | ||
| 6233 | ktime_to_ns(ktime_sub(ktime_get(), | ||
| 6234 | vmx->loaded_vmcs->entry_time)); | ||
| 6235 | } | ||
| 6236 | |||
| 6237 | static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, | ||
| 6238 | u32 idt_vectoring_info, | ||
| 6239 | int instr_len_field, | ||
| 6240 | int error_code_field) | ||
| 6241 | { | ||
| 6242 | u8 vector; | ||
| 6243 | int type; | ||
| 6244 | bool idtv_info_valid; | ||
| 6245 | |||
| 6246 | idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; | ||
| 6247 | |||
| 6248 | vcpu->arch.nmi_injected = false; | ||
| 6249 | kvm_clear_exception_queue(vcpu); | ||
| 6250 | kvm_clear_interrupt_queue(vcpu); | ||
| 6251 | |||
| 6252 | if (!idtv_info_valid) | ||
| 6253 | return; | ||
| 6254 | |||
| 6255 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
| 6256 | |||
| 6257 | vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; | ||
| 6258 | type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; | ||
| 6259 | |||
| 6260 | switch (type) { | ||
| 6261 | case INTR_TYPE_NMI_INTR: | ||
| 6262 | vcpu->arch.nmi_injected = true; | ||
| 6263 | /* | ||
| 6264 | * SDM 3: 27.7.1.2 (September 2008) | ||
| 6265 | * Clear bit "block by NMI" before VM entry if an NMI | ||
| 6266 | * delivery faulted. | ||
| 6267 | */ | ||
| 6268 | vmx_set_nmi_mask(vcpu, false); | ||
| 6269 | break; | ||
| 6270 | case INTR_TYPE_SOFT_EXCEPTION: | ||
| 6271 | vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); | ||
| 6272 | /* fall through */ | ||
| 6273 | case INTR_TYPE_HARD_EXCEPTION: | ||
| 6274 | if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { | ||
| 6275 | u32 err = vmcs_read32(error_code_field); | ||
| 6276 | kvm_requeue_exception_e(vcpu, vector, err); | ||
| 6277 | } else | ||
| 6278 | kvm_requeue_exception(vcpu, vector); | ||
| 6279 | break; | ||
| 6280 | case INTR_TYPE_SOFT_INTR: | ||
| 6281 | vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); | ||
| 6282 | /* fall through */ | ||
| 6283 | case INTR_TYPE_EXT_INTR: | ||
| 6284 | kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); | ||
| 6285 | break; | ||
| 6286 | default: | ||
| 6287 | break; | ||
| 6288 | } | ||
| 6289 | } | ||
| 6290 | |||
| 6291 | static void vmx_complete_interrupts(struct vcpu_vmx *vmx) | ||
| 6292 | { | ||
| 6293 | __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, | ||
| 6294 | VM_EXIT_INSTRUCTION_LEN, | ||
| 6295 | IDT_VECTORING_ERROR_CODE); | ||
| 6296 | } | ||
| 6297 | |||
| 6298 | static void vmx_cancel_injection(struct kvm_vcpu *vcpu) | ||
| 6299 | { | ||
| 6300 | __vmx_complete_interrupts(vcpu, | ||
| 6301 | vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), | ||
| 6302 | VM_ENTRY_INSTRUCTION_LEN, | ||
| 6303 | VM_ENTRY_EXCEPTION_ERROR_CODE); | ||
| 6304 | |||
| 6305 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); | ||
| 6306 | } | ||
| 6307 | |||
| 6308 | static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) | ||
| 6309 | { | ||
| 6310 | int i, nr_msrs; | ||
| 6311 | struct perf_guest_switch_msr *msrs; | ||
| 6312 | |||
| 6313 | msrs = perf_guest_get_msrs(&nr_msrs); | ||
| 6314 | |||
| 6315 | if (!msrs) | ||
| 6316 | return; | ||
| 6317 | |||
| 6318 | for (i = 0; i < nr_msrs; i++) | ||
| 6319 | if (msrs[i].host == msrs[i].guest) | ||
| 6320 | clear_atomic_switch_msr(vmx, msrs[i].msr); | ||
| 6321 | else | ||
| 6322 | add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, | ||
| 6323 | msrs[i].host, false); | ||
| 6324 | } | ||
| 6325 | |||
| 6326 | static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val) | ||
| 6327 | { | ||
| 6328 | vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val); | ||
| 6329 | if (!vmx->loaded_vmcs->hv_timer_armed) | ||
| 6330 | vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, | ||
| 6331 | PIN_BASED_VMX_PREEMPTION_TIMER); | ||
| 6332 | vmx->loaded_vmcs->hv_timer_armed = true; | ||
| 6333 | } | ||
| 6334 | |||
| 6335 | static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) | ||
| 6336 | { | ||
| 6337 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 6338 | u64 tscl; | ||
| 6339 | u32 delta_tsc; | ||
| 6340 | |||
| 6341 | if (vmx->req_immediate_exit) { | ||
| 6342 | vmx_arm_hv_timer(vmx, 0); | ||
| 6343 | return; | ||
| 6344 | } | ||
| 6345 | |||
| 6346 | if (vmx->hv_deadline_tsc != -1) { | ||
| 6347 | tscl = rdtsc(); | ||
| 6348 | if (vmx->hv_deadline_tsc > tscl) | ||
| 6349 | /* set_hv_timer ensures the delta fits in 32 bits */ | ||
| 6350 | delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> | ||
| 6351 | cpu_preemption_timer_multi); | ||
| 6352 | else | ||
| 6353 | delta_tsc = 0; | ||
| 6354 | |||
| 6355 | vmx_arm_hv_timer(vmx, delta_tsc); | ||
| 6356 | return; | ||
| 6357 | } | ||
| 6358 | |||
| 6359 | if (vmx->loaded_vmcs->hv_timer_armed) | ||
| 6360 | vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, | ||
| 6361 | PIN_BASED_VMX_PREEMPTION_TIMER); | ||
| 6362 | vmx->loaded_vmcs->hv_timer_armed = false; | ||
| 6363 | } | ||
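
vmx_update_hv_timer() arms the VMX preemption timer with the remaining TSC delta to the deadline, scaled down by the timer's rate and clamped when the deadline has already passed or an immediate exit is requested. A sketch of just that arithmetic, with an illustrative shift value standing in for cpu_preemption_timer_multi:

#include <stdint.h>
#include <stdio.h>

static uint32_t hv_timer_ticks(uint64_t deadline_tsc, uint64_t now_tsc,
			       unsigned int timer_shift)
{
	if (deadline_tsc <= now_tsc)
		return 0;		/* deadline passed: fire immediately */
	/* The preemption timer counts at TSC >> timer_shift. */
	return (uint32_t)((deadline_tsc - now_tsc) >> timer_shift);
}

int main(void)
{
	/* Deadline 5 million TSC ticks away, timer counting TSC/32. */
	printf("%u\n", hv_timer_ticks(105000000ull, 100000000ull, 5));
	return 0;
}
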
| 6364 | |||
| 6365 | static void vmx_vcpu_run(struct kvm_vcpu *vcpu) | ||
| 6366 | { | ||
| 6367 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 6368 | unsigned long cr3, cr4, evmcs_rsp; | ||
| 6369 | |||
| 6370 | /* Record the guest's net vcpu time for enforced NMI injections. */ | ||
| 6371 | if (unlikely(!enable_vnmi && | ||
| 6372 | vmx->loaded_vmcs->soft_vnmi_blocked)) | ||
| 6373 | vmx->loaded_vmcs->entry_time = ktime_get(); | ||
| 6374 | |||
| 6375 | /* Don't enter VMX if guest state is invalid; let the exit handler | ||
| 6376 | start emulating until we arrive back at a valid state. */ | ||
| 6377 | if (vmx->emulation_required) | ||
| 6378 | return; | ||
| 6379 | |||
| 6380 | if (vmx->ple_window_dirty) { | ||
| 6381 | vmx->ple_window_dirty = false; | ||
| 6382 | vmcs_write32(PLE_WINDOW, vmx->ple_window); | ||
| 6383 | } | ||
| 6384 | |||
| 6385 | if (vmx->nested.need_vmcs12_sync) | ||
| 6386 | nested_sync_from_vmcs12(vcpu); | ||
| 6387 | |||
| 6388 | if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) | ||
| 6389 | vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); | ||
| 6390 | if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) | ||
| 6391 | vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); | ||
| 6392 | |||
| 6393 | cr3 = __get_current_cr3_fast(); | ||
| 6394 | if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { | ||
| 6395 | vmcs_writel(HOST_CR3, cr3); | ||
| 6396 | vmx->loaded_vmcs->host_state.cr3 = cr3; | ||
| 6397 | } | ||
| 6398 | |||
| 6399 | cr4 = cr4_read_shadow(); | ||
| 6400 | if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { | ||
| 6401 | vmcs_writel(HOST_CR4, cr4); | ||
| 6402 | vmx->loaded_vmcs->host_state.cr4 = cr4; | ||
| 6403 | } | ||
| 6404 | |||
| 6405 | /* When single-stepping over STI and MOV SS, we must clear the | ||
| 6406 | * corresponding interruptibility bits in the guest state. Otherwise | ||
| 6407 | * vmentry fails as it then expects bit 14 (BS) in the pending debug | ||
| 6408 | * exceptions field to be set, but that's not correct for the guest | ||
| 6409 | * debugging case. */ | ||
| 6410 | if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) | ||
| 6411 | vmx_set_interrupt_shadow(vcpu, 0); | ||
| 6412 | |||
| 6413 | if (static_cpu_has(X86_FEATURE_PKU) && | ||
| 6414 | kvm_read_cr4_bits(vcpu, X86_CR4_PKE) && | ||
| 6415 | vcpu->arch.pkru != vmx->host_pkru) | ||
| 6416 | __write_pkru(vcpu->arch.pkru); | ||
| 6417 | |||
| 6418 | pt_guest_enter(vmx); | ||
| 6419 | |||
| 6420 | atomic_switch_perf_msrs(vmx); | ||
| 6421 | |||
| 6422 | vmx_update_hv_timer(vcpu); | ||
| 6423 | |||
| 6424 | /* | ||
| 6425 | * If this vCPU has touched SPEC_CTRL, restore the guest's value if | ||
| 6426 | * it's non-zero. Since vmentry is serialising on affected CPUs, there | ||
| 6427 | * is no need to worry about the conditional branch over the wrmsr | ||
| 6428 | * being speculatively taken. | ||
| 6429 | */ | ||
| 6430 | x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); | ||
| 6431 | |||
| 6432 | vmx->__launched = vmx->loaded_vmcs->launched; | ||
| 6433 | |||
| 6434 | evmcs_rsp = static_branch_unlikely(&enable_evmcs) ? | ||
| 6435 | (unsigned long)¤t_evmcs->host_rsp : 0; | ||
| 6436 | |||
| 6437 | if (static_branch_unlikely(&vmx_l1d_should_flush)) | ||
| 6438 | vmx_l1d_flush(vcpu); | ||
| 6439 | |||
| 6440 | asm( | ||
| 6441 | /* Store host registers */ | ||
| 6442 | "push %%" _ASM_DX "; push %%" _ASM_BP ";" | ||
| 6443 | "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */ | ||
| 6444 | "push %%" _ASM_CX " \n\t" | ||
| 6445 | "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ | ||
| 6446 | "cmp %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t" | ||
| 6447 | "je 1f \n\t" | ||
| 6448 | "mov %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t" | ||
| 6449 | /* Avoid VMWRITE when Enlightened VMCS is in use */ | ||
| 6450 | "test %%" _ASM_SI ", %%" _ASM_SI " \n\t" | ||
| 6451 | "jz 2f \n\t" | ||
| 6452 | "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t" | ||
| 6453 | "jmp 1f \n\t" | ||
| 6454 | "2: \n\t" | ||
| 6455 | __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" | ||
| 6456 | "1: \n\t" | ||
| 6457 | "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ | ||
| 6458 | |||
| 6459 | /* Reload cr2 if changed */ | ||
| 6460 | "mov %c[cr2](%%" _ASM_CX "), %%" _ASM_AX " \n\t" | ||
| 6461 | "mov %%cr2, %%" _ASM_DX " \n\t" | ||
| 6462 | "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t" | ||
| 6463 | "je 3f \n\t" | ||
| 6464 | "mov %%" _ASM_AX", %%cr2 \n\t" | ||
| 6465 | "3: \n\t" | ||
| 6466 | /* Check if vmlaunch or vmresume is needed */ | ||
| 6467 | "cmpl $0, %c[launched](%%" _ASM_CX ") \n\t" | ||
| 6468 | /* Load guest registers. Don't clobber flags. */ | ||
| 6469 | "mov %c[rax](%%" _ASM_CX "), %%" _ASM_AX " \n\t" | ||
| 6470 | "mov %c[rbx](%%" _ASM_CX "), %%" _ASM_BX " \n\t" | ||
| 6471 | "mov %c[rdx](%%" _ASM_CX "), %%" _ASM_DX " \n\t" | ||
| 6472 | "mov %c[rsi](%%" _ASM_CX "), %%" _ASM_SI " \n\t" | ||
| 6473 | "mov %c[rdi](%%" _ASM_CX "), %%" _ASM_DI " \n\t" | ||
| 6474 | "mov %c[rbp](%%" _ASM_CX "), %%" _ASM_BP " \n\t" | ||
| 6475 | #ifdef CONFIG_X86_64 | ||
| 6476 | "mov %c[r8](%%" _ASM_CX "), %%r8 \n\t" | ||
| 6477 | "mov %c[r9](%%" _ASM_CX "), %%r9 \n\t" | ||
| 6478 | "mov %c[r10](%%" _ASM_CX "), %%r10 \n\t" | ||
| 6479 | "mov %c[r11](%%" _ASM_CX "), %%r11 \n\t" | ||
| 6480 | "mov %c[r12](%%" _ASM_CX "), %%r12 \n\t" | ||
| 6481 | "mov %c[r13](%%" _ASM_CX "), %%r13 \n\t" | ||
| 6482 | "mov %c[r14](%%" _ASM_CX "), %%r14 \n\t" | ||
| 6483 | "mov %c[r15](%%" _ASM_CX "), %%r15 \n\t" | ||
| 6484 | #endif | ||
| 6485 | /* Load guest RCX. This kills the vmx_vcpu pointer! */ | ||
| 6486 | "mov %c[rcx](%%" _ASM_CX "), %%" _ASM_CX " \n\t" | ||
| 6487 | |||
| 6488 | /* Enter guest mode */ | ||
| 6489 | "call vmx_vmenter\n\t" | ||
| 6490 | |||
| 6491 | /* Save guest's RCX to the stack placeholder (see above) */ | ||
| 6492 | "mov %%" _ASM_CX ", %c[wordsize](%%" _ASM_SP ") \n\t" | ||
| 6493 | |||
| 6494 | /* Load host's RCX, i.e. the vmx_vcpu pointer */ | ||
| 6495 | "pop %%" _ASM_CX " \n\t" | ||
| 6496 | |||
| 6497 | /* Set vmx->fail based on EFLAGS.{CF,ZF} */ | ||
| 6498 | "setbe %c[fail](%%" _ASM_CX ")\n\t" | ||
| 6499 | |||
| 6500 | /* Save all guest registers, including RCX from the stack */ | ||
| 6501 | "mov %%" _ASM_AX ", %c[rax](%%" _ASM_CX ") \n\t" | ||
| 6502 | "mov %%" _ASM_BX ", %c[rbx](%%" _ASM_CX ") \n\t" | ||
| 6503 | __ASM_SIZE(pop) " %c[rcx](%%" _ASM_CX ") \n\t" | ||
| 6504 | "mov %%" _ASM_DX ", %c[rdx](%%" _ASM_CX ") \n\t" | ||
| 6505 | "mov %%" _ASM_SI ", %c[rsi](%%" _ASM_CX ") \n\t" | ||
| 6506 | "mov %%" _ASM_DI ", %c[rdi](%%" _ASM_CX ") \n\t" | ||
| 6507 | "mov %%" _ASM_BP ", %c[rbp](%%" _ASM_CX ") \n\t" | ||
| 6508 | #ifdef CONFIG_X86_64 | ||
| 6509 | "mov %%r8, %c[r8](%%" _ASM_CX ") \n\t" | ||
| 6510 | "mov %%r9, %c[r9](%%" _ASM_CX ") \n\t" | ||
| 6511 | "mov %%r10, %c[r10](%%" _ASM_CX ") \n\t" | ||
| 6512 | "mov %%r11, %c[r11](%%" _ASM_CX ") \n\t" | ||
| 6513 | "mov %%r12, %c[r12](%%" _ASM_CX ") \n\t" | ||
| 6514 | "mov %%r13, %c[r13](%%" _ASM_CX ") \n\t" | ||
| 6515 | "mov %%r14, %c[r14](%%" _ASM_CX ") \n\t" | ||
| 6516 | "mov %%r15, %c[r15](%%" _ASM_CX ") \n\t" | ||
| 6517 | /* | ||
| 6518 | * Clear host registers marked as clobbered to prevent | ||
| 6519 | * speculative use. | ||
| 6520 | */ | ||
| 6521 | "xor %%r8d, %%r8d \n\t" | ||
| 6522 | "xor %%r9d, %%r9d \n\t" | ||
| 6523 | "xor %%r10d, %%r10d \n\t" | ||
| 6524 | "xor %%r11d, %%r11d \n\t" | ||
| 6525 | "xor %%r12d, %%r12d \n\t" | ||
| 6526 | "xor %%r13d, %%r13d \n\t" | ||
| 6527 | "xor %%r14d, %%r14d \n\t" | ||
| 6528 | "xor %%r15d, %%r15d \n\t" | ||
| 6529 | #endif | ||
| 6530 | "mov %%cr2, %%" _ASM_AX " \n\t" | ||
| 6531 | "mov %%" _ASM_AX ", %c[cr2](%%" _ASM_CX ") \n\t" | ||
| 6532 | |||
| 6533 | "xor %%eax, %%eax \n\t" | ||
| 6534 | "xor %%ebx, %%ebx \n\t" | ||
| 6535 | "xor %%esi, %%esi \n\t" | ||
| 6536 | "xor %%edi, %%edi \n\t" | ||
| 6537 | "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" | ||
| 6538 | : ASM_CALL_CONSTRAINT | ||
| 6539 | : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp), | ||
| 6540 | [launched]"i"(offsetof(struct vcpu_vmx, __launched)), | ||
| 6541 | [fail]"i"(offsetof(struct vcpu_vmx, fail)), | ||
| 6542 | [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), | ||
| 6543 | [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), | ||
| 6544 | [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), | ||
| 6545 | [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), | ||
| 6546 | [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), | ||
| 6547 | [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), | ||
| 6548 | [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), | ||
| 6549 | [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), | ||
| 6550 | #ifdef CONFIG_X86_64 | ||
| 6551 | [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), | ||
| 6552 | [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), | ||
| 6553 | [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), | ||
| 6554 | [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), | ||
| 6555 | [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), | ||
| 6556 | [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), | ||
| 6557 | [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), | ||
| 6558 | [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), | ||
| 6559 | #endif | ||
| 6560 | [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), | ||
| 6561 | [wordsize]"i"(sizeof(ulong)) | ||
| 6562 | : "cc", "memory" | ||
| 6563 | #ifdef CONFIG_X86_64 | ||
| 6564 | , "rax", "rbx", "rdi" | ||
| 6565 | , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" | ||
| 6566 | #else | ||
| 6567 | , "eax", "ebx", "edi" | ||
| 6568 | #endif | ||
| 6569 | ); | ||
| 6570 | |||
| 6571 | /* | ||
| 6572 | * We do not use IBRS in the kernel. If this vCPU has used the | ||
| 6573 | * SPEC_CTRL MSR it may have left it on; save the value and | ||
| 6574 | * turn it off. This is much more efficient than blindly adding | ||
| 6575 | * it to the atomic save/restore list, especially as the former | ||
| 6576 | * (saving guest MSRs on vmexit) doesn't even exist in KVM. | ||
| 6577 | * | ||
| 6578 | * For non-nested case: | ||
| 6579 | * If the L01 MSR bitmap does not intercept the MSR, then we need to | ||
| 6580 | * save it. | ||
| 6581 | * | ||
| 6582 | * For nested case: | ||
| 6583 | * If the L02 MSR bitmap does not intercept the MSR, then we need to | ||
| 6584 | * save it. | ||
| 6585 | */ | ||
| 6586 | if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) | ||
| 6587 | vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); | ||
| 6588 | |||
| 6589 | x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); | ||
| 6590 | |||
| 6591 | /* Eliminate branch target predictions from guest mode */ | ||
| 6592 | vmexit_fill_RSB(); | ||
| 6593 | |||
| 6594 | /* All fields are clean at this point */ | ||
| 6595 | if (static_branch_unlikely(&enable_evmcs)) | ||
| 6596 | current_evmcs->hv_clean_fields |= | ||
| 6597 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; | ||
| 6598 | |||
| 6599 | /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ | ||
| 6600 | if (vmx->host_debugctlmsr) | ||
| 6601 | update_debugctlmsr(vmx->host_debugctlmsr); | ||
| 6602 | |||
| 6603 | #ifndef CONFIG_X86_64 | ||
| 6604 | /* | ||
| 6605 | * The sysexit path does not restore ds/es, so we must set them to | ||
| 6606 | * a reasonable value ourselves. | ||
| 6607 | * | ||
| 6608 | * We can't defer this to vmx_prepare_switch_to_host() since that | ||
| 6609 | * function may be executed in interrupt context, which saves and | ||
| 6610 | * restores segments around it, nullifying its effect. | ||
| 6611 | */ | ||
| 6612 | loadsegment(ds, __USER_DS); | ||
| 6613 | loadsegment(es, __USER_DS); | ||
| 6614 | #endif | ||
| 6615 | |||
| 6616 | vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) | ||
| 6617 | | (1 << VCPU_EXREG_RFLAGS) | ||
| 6618 | | (1 << VCPU_EXREG_PDPTR) | ||
| 6619 | | (1 << VCPU_EXREG_SEGMENTS) | ||
| 6620 | | (1 << VCPU_EXREG_CR3)); | ||
| 6621 | vcpu->arch.regs_dirty = 0; | ||
| 6622 | |||
| 6623 | pt_guest_exit(vmx); | ||
| 6624 | |||
| 6625 | /* | ||
| 6626 | * Eager FPU is enabled if PKEY is supported and CR4 has been switched | ||
| 6627 | * back to the host value, so it is safe to read the guest PKRU from | ||
| 6628 | * the current XSAVE area. | ||
| 6629 | */ | ||
| 6630 | if (static_cpu_has(X86_FEATURE_PKU) && | ||
| 6631 | kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) { | ||
| 6632 | vcpu->arch.pkru = __read_pkru(); | ||
| 6633 | if (vcpu->arch.pkru != vmx->host_pkru) | ||
| 6634 | __write_pkru(vmx->host_pkru); | ||
| 6635 | } | ||
| 6636 | |||
| 6637 | vmx->nested.nested_run_pending = 0; | ||
| 6638 | vmx->idt_vectoring_info = 0; | ||
| 6639 | |||
| 6640 | vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON); | ||
| 6641 | if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) | ||
| 6642 | return; | ||
| 6643 | |||
| 6644 | vmx->loaded_vmcs->launched = 1; | ||
| 6645 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | ||
| 6646 | |||
| 6647 | vmx_complete_atomic_exit(vmx); | ||
| 6648 | vmx_recover_nmi_blocking(vmx); | ||
| 6649 | vmx_complete_interrupts(vmx); | ||
| 6650 | } | ||
| 6651 | STACK_FRAME_NON_STANDARD(vmx_vcpu_run); | ||
| 6652 | |||
| 6653 | static struct kvm *vmx_vm_alloc(void) | ||
| 6654 | { | ||
| 6655 | struct kvm_vmx *kvm_vmx = vzalloc(sizeof(struct kvm_vmx)); | ||
| 6656 | return &kvm_vmx->kvm; | ||
| 6657 | } | ||
| 6658 | |||
| 6659 | static void vmx_vm_free(struct kvm *kvm) | ||
| 6660 | { | ||
| 6661 | vfree(to_kvm_vmx(kvm)); | ||
| 6662 | } | ||
| 6663 | |||
| 6664 | static void vmx_free_vcpu(struct kvm_vcpu *vcpu) | ||
| 6665 | { | ||
| 6666 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 6667 | |||
| 6668 | if (enable_pml) | ||
| 6669 | vmx_destroy_pml_buffer(vmx); | ||
| 6670 | free_vpid(vmx->vpid); | ||
| 6671 | leave_guest_mode(vcpu); | ||
| 6672 | nested_vmx_free_vcpu(vcpu); | ||
| 6673 | free_loaded_vmcs(vmx->loaded_vmcs); | ||
| 6674 | kfree(vmx->guest_msrs); | ||
| 6675 | kvm_vcpu_uninit(vcpu); | ||
| 6676 | kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu); | ||
| 6677 | kmem_cache_free(kvm_vcpu_cache, vmx); | ||
| 6678 | } | ||
| 6679 | |||
| 6680 | static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | ||
| 6681 | { | ||
| 6682 | int err; | ||
| 6683 | struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); | ||
| 6684 | unsigned long *msr_bitmap; | ||
| 6685 | int cpu; | ||
| 6686 | |||
| 6687 | if (!vmx) | ||
| 6688 | return ERR_PTR(-ENOMEM); | ||
| 6689 | |||
| 6690 | vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL); | ||
| 6691 | if (!vmx->vcpu.arch.guest_fpu) { | ||
| 6692 | printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); | ||
| 6693 | err = -ENOMEM; | ||
| 6694 | goto free_partial_vcpu; | ||
| 6695 | } | ||
| 6696 | |||
| 6697 | vmx->vpid = allocate_vpid(); | ||
| 6698 | |||
| 6699 | err = kvm_vcpu_init(&vmx->vcpu, kvm, id); | ||
| 6700 | if (err) | ||
| 6701 | goto free_vcpu; | ||
| 6702 | |||
| 6703 | err = -ENOMEM; | ||
| 6704 | |||
| 6705 | /* | ||
| 6706 | * If PML is turned on, a failure to enable PML simply results in failure | ||
| 6707 | * to create the vcpu, so we can simplify the PML logic (by avoiding | ||
| 6708 | * having to deal with cases such as enabling PML partially on vcpus | ||
| 6709 | * for the guest, etc.). | ||
| 6710 | */ | ||
| 6711 | if (enable_pml) { | ||
| 6712 | vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
| 6713 | if (!vmx->pml_pg) | ||
| 6714 | goto uninit_vcpu; | ||
| 6715 | } | ||
| 6716 | |||
| 6717 | vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
| 6718 | BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0]) | ||
| 6719 | > PAGE_SIZE); | ||
| 6720 | |||
| 6721 | if (!vmx->guest_msrs) | ||
| 6722 | goto free_pml; | ||
| 6723 | |||
| 6724 | err = alloc_loaded_vmcs(&vmx->vmcs01); | ||
| 6725 | if (err < 0) | ||
| 6726 | goto free_msrs; | ||
| 6727 | |||
| 6728 | msr_bitmap = vmx->vmcs01.msr_bitmap; | ||
| 6729 | vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R); | ||
| 6730 | vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW); | ||
| 6731 | vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW); | ||
| 6732 | vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); | ||
| 6733 | vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); | ||
| 6734 | vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); | ||
| 6735 | vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); | ||
| 6736 | vmx->msr_bitmap_mode = 0; | ||
| 6737 | |||
| 6738 | vmx->loaded_vmcs = &vmx->vmcs01; | ||
| 6739 | cpu = get_cpu(); | ||
| 6740 | vmx_vcpu_load(&vmx->vcpu, cpu); | ||
| 6741 | vmx->vcpu.cpu = cpu; | ||
| 6742 | vmx_vcpu_setup(vmx); | ||
| 6743 | vmx_vcpu_put(&vmx->vcpu); | ||
| 6744 | put_cpu(); | ||
| 6745 | if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { | ||
| 6746 | err = alloc_apic_access_page(kvm); | ||
| 6747 | if (err) | ||
| 6748 | goto free_vmcs; | ||
| 6749 | } | ||
| 6750 | |||
| 6751 | if (enable_ept && !enable_unrestricted_guest) { | ||
| 6752 | err = init_rmode_identity_map(kvm); | ||
| 6753 | if (err) | ||
| 6754 | goto free_vmcs; | ||
| 6755 | } | ||
| 6756 | |||
| 6757 | if (nested) | ||
| 6758 | nested_vmx_setup_ctls_msrs(&vmx->nested.msrs, | ||
| 6759 | vmx_capability.ept, | ||
| 6760 | kvm_vcpu_apicv_active(&vmx->vcpu)); | ||
| 6761 | else | ||
| 6762 | memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs)); | ||
| 6763 | |||
| 6764 | vmx->nested.posted_intr_nv = -1; | ||
| 6765 | vmx->nested.current_vmptr = -1ull; | ||
| 6766 | |||
| 6767 | vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED; | ||
| 6768 | |||
| 6769 | /* | ||
| 6770 | * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR | ||
| 6771 | * or POSTED_INTR_WAKEUP_VECTOR. | ||
| 6772 | */ | ||
| 6773 | vmx->pi_desc.nv = POSTED_INTR_VECTOR; | ||
| 6774 | vmx->pi_desc.sn = 1; | ||
| 6775 | |||
| 6776 | vmx->ept_pointer = INVALID_PAGE; | ||
| 6777 | |||
| 6778 | return &vmx->vcpu; | ||
| 6779 | |||
| 6780 | free_vmcs: | ||
| 6781 | free_loaded_vmcs(vmx->loaded_vmcs); | ||
| 6782 | free_msrs: | ||
| 6783 | kfree(vmx->guest_msrs); | ||
| 6784 | free_pml: | ||
| 6785 | vmx_destroy_pml_buffer(vmx); | ||
| 6786 | uninit_vcpu: | ||
| 6787 | kvm_vcpu_uninit(&vmx->vcpu); | ||
| 6788 | free_vcpu: | ||
| 6789 | free_vpid(vmx->vpid); | ||
| 6790 | kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu); | ||
| 6791 | free_partial_vcpu: | ||
| 6792 | kmem_cache_free(kvm_vcpu_cache, vmx); | ||
| 6793 | return ERR_PTR(err); | ||
| 6794 | } | ||
| 6795 | |||
| 6796 | #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" | ||
| 6797 | #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" | ||
| 6798 | |||
| 6799 | static int vmx_vm_init(struct kvm *kvm) | ||
| 6800 | { | ||
| 6801 | spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock); | ||
| 6802 | |||
| 6803 | if (!ple_gap) | ||
| 6804 | kvm->arch.pause_in_guest = true; | ||
| 6805 | |||
| 6806 | if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { | ||
| 6807 | switch (l1tf_mitigation) { | ||
| 6808 | case L1TF_MITIGATION_OFF: | ||
| 6809 | case L1TF_MITIGATION_FLUSH_NOWARN: | ||
| 6810 | /* 'I explicitly don't care' is set */ | ||
| 6811 | break; | ||
| 6812 | case L1TF_MITIGATION_FLUSH: | ||
| 6813 | case L1TF_MITIGATION_FLUSH_NOSMT: | ||
| 6814 | case L1TF_MITIGATION_FULL: | ||
| 6815 | /* | ||
| 6816 | * Warn upon starting the first VM in a potentially | ||
| 6817 | * insecure environment. | ||
| 6818 | */ | ||
| 6819 | if (cpu_smt_control == CPU_SMT_ENABLED) | ||
| 6820 | pr_warn_once(L1TF_MSG_SMT); | ||
| 6821 | if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) | ||
| 6822 | pr_warn_once(L1TF_MSG_L1D); | ||
| 6823 | break; | ||
| 6824 | case L1TF_MITIGATION_FULL_FORCE: | ||
| 6825 | /* Flush is enforced */ | ||
| 6826 | break; | ||
| 6827 | } | ||
| 6828 | } | ||
| 6829 | return 0; | ||
| 6830 | } | ||
| 6831 | |||
| 6832 | static void __init vmx_check_processor_compat(void *rtn) | ||
| 6833 | { | ||
| 6834 | struct vmcs_config vmcs_conf; | ||
| 6835 | struct vmx_capability vmx_cap; | ||
| 6836 | |||
| 6837 | *(int *)rtn = 0; | ||
| 6838 | if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) | ||
| 6839 | *(int *)rtn = -EIO; | ||
| 6840 | if (nested) | ||
| 6841 | nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept, | ||
| 6842 | enable_apicv); | ||
| 6843 | if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { | ||
| 6844 | printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", | ||
| 6845 | smp_processor_id()); | ||
| 6846 | *(int *)rtn = -EIO; | ||
| 6847 | } | ||
| 6848 | } | ||
| 6849 | |||
| 6850 | static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) | ||
| 6851 | { | ||
| 6852 | u8 cache; | ||
| 6853 | u64 ipat = 0; | ||
| 6854 | |||
| 6855 | /* For VT-d and EPT combination | ||
| 6856 | * 1. MMIO: always map as UC | ||
| 6857 | * 2. EPT with VT-d: | ||
| 6858 | * a. VT-d without the snooping control feature: we can't guarantee | ||
| 6859 | * the result, so try to trust the guest. | ||
| 6860 | * b. VT-d with the snooping control feature: the snooping control of | ||
| 6861 | * the VT-d engine guarantees cache correctness. Just set the type | ||
| 6862 | * to WB to keep it consistent with the host, i.e. the same as item 3. | ||
| 6863 | * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep it | ||
| 6864 | * consistent with the host MTRRs. | ||
| 6865 | */ | ||
| 6866 | if (is_mmio) { | ||
| 6867 | cache = MTRR_TYPE_UNCACHABLE; | ||
| 6868 | goto exit; | ||
| 6869 | } | ||
| 6870 | |||
| 6871 | if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) { | ||
| 6872 | ipat = VMX_EPT_IPAT_BIT; | ||
| 6873 | cache = MTRR_TYPE_WRBACK; | ||
| 6874 | goto exit; | ||
| 6875 | } | ||
| 6876 | |||
| 6877 | if (kvm_read_cr0(vcpu) & X86_CR0_CD) { | ||
| 6878 | ipat = VMX_EPT_IPAT_BIT; | ||
| 6879 | if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) | ||
| 6880 | cache = MTRR_TYPE_WRBACK; | ||
| 6881 | else | ||
| 6882 | cache = MTRR_TYPE_UNCACHABLE; | ||
| 6883 | goto exit; | ||
| 6884 | } | ||
| 6885 | |||
| 6886 | cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn); | ||
| 6887 | |||
| 6888 | exit: | ||
| 6889 | return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; | ||
| 6890 | } | ||
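As a hedged aside on the return value above (illustrative only; it assumes the usual definitions VMX_EPT_MT_EPTE_SHIFT == 3 and VMX_EPT_IPAT_BIT == (1ull << 6), with MTRR_TYPE_WRBACK == 6 and MTRR_TYPE_UNCACHABLE == 0):

	/* Sketch, not part of the patch: values vmx_get_mt_mask() produces. */
	u64 wb_ipat = ((u64)MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) |
		      VMX_EPT_IPAT_BIT;	/* (6 << 3) | (1 << 6) == 0x70 */
	u64 mmio_uc = (u64)MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
					/* 0: UC, guest PAT still honoured */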
| 6891 | |||
| 6892 | static int vmx_get_lpage_level(void) | ||
| 6893 | { | ||
| 6894 | if (enable_ept && !cpu_has_vmx_ept_1g_page()) | ||
| 6895 | return PT_DIRECTORY_LEVEL; | ||
| 6896 | else | ||
| 6897 | /* Shadow paging, and EPT with 1GB-page support, both allow 1GB pages */ | ||
| 6898 | return PT_PDPE_LEVEL; | ||
| 6899 | } | ||
| 6900 | |||
| 6901 | static void vmcs_set_secondary_exec_control(u32 new_ctl) | ||
| 6902 | { | ||
| 6903 | /* | ||
| 6904 | * These bits in the secondary execution controls field | ||
| 6905 | * are dynamic; the others are mostly based on the hypervisor | ||
| 6906 | * architecture and the guest's CPUID. Do not touch the | ||
| 6907 | * dynamic bits. | ||
| 6908 | */ | ||
| 6909 | u32 mask = | ||
| 6910 | SECONDARY_EXEC_SHADOW_VMCS | | ||
| 6911 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | | ||
| 6912 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | | ||
| 6913 | SECONDARY_EXEC_DESC; | ||
| 6914 | |||
| 6915 | u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); | ||
| 6916 | |||
| 6917 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, | ||
| 6918 | (new_ctl & ~mask) | (cur_ctl & mask)); | ||
| 6919 | } | ||
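A tiny illustration of the read-modify-write above, with made-up values, showing that the dynamic bits keep their current state while the rest take the requested value:

	/* Sketch only: mask = 0x0c, cur_ctl = 0x0f, new_ctl = 0x30. */
	u32 written = (0x30 & ~0x0c) | (0x0f & 0x0c);	/* == 0x30 | 0x0c == 0x3c */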
| 6920 | |||
| 6921 | /* | ||
| 6922 | * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits | ||
| 6923 | * (indicating "allowed-1") if they are supported in the guest's CPUID. | ||
| 6924 | */ | ||
| 6925 | static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) | ||
| 6926 | { | ||
| 6927 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 6928 | struct kvm_cpuid_entry2 *entry; | ||
| 6929 | |||
| 6930 | vmx->nested.msrs.cr0_fixed1 = 0xffffffff; | ||
| 6931 | vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE; | ||
| 6932 | |||
| 6933 | #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ | ||
| 6934 | if (entry && (entry->_reg & (_cpuid_mask))) \ | ||
| 6935 | vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ | ||
| 6936 | } while (0) | ||
| 6937 | |||
| 6938 | entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); | ||
| 6939 | cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME)); | ||
| 6940 | cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME)); | ||
| 6941 | cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC)); | ||
| 6942 | cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE)); | ||
| 6943 | cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE)); | ||
| 6944 | cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE)); | ||
| 6945 | cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE)); | ||
| 6946 | cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE)); | ||
| 6947 | cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR)); | ||
| 6948 | cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM)); | ||
| 6949 | cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX)); | ||
| 6950 | cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX)); | ||
| 6951 | cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID)); | ||
| 6952 | cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE)); | ||
| 6953 | |||
| 6954 | entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); | ||
| 6955 | cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE)); | ||
| 6956 | cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP)); | ||
| 6957 | cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP)); | ||
| 6958 | cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU)); | ||
| 6959 | cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP)); | ||
| 6960 | |||
| 6961 | #undef cr4_fixed1_update | ||
| 6962 | } | ||
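For readers tracing the macro, one invocation hand-expands roughly as follows (the do/while(0) wrapper is omitted for clarity; this is only a sketch of what the preprocessor emits):

	/* cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP)) becomes: */
	if (entry && (entry->ebx & bit(X86_FEATURE_SMEP)))
		vmx->nested.msrs.cr4_fixed1 |= X86_CR4_SMEP;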
| 6963 | |||
| 6964 | static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) | ||
| 6965 | { | ||
| 6966 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 6967 | |||
| 6968 | if (kvm_mpx_supported()) { | ||
| 6969 | bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX); | ||
| 6970 | |||
| 6971 | if (mpx_enabled) { | ||
| 6972 | vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; | ||
| 6973 | vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; | ||
| 6974 | } else { | ||
| 6975 | vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS; | ||
| 6976 | vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS; | ||
| 6977 | } | ||
| 6978 | } | ||
| 6979 | } | ||
| 6980 | |||
| 6981 | static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) | ||
| 6982 | { | ||
| 6983 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 6984 | struct kvm_cpuid_entry2 *best = NULL; | ||
| 6985 | int i; | ||
| 6986 | |||
| 6987 | for (i = 0; i < PT_CPUID_LEAVES; i++) { | ||
| 6988 | best = kvm_find_cpuid_entry(vcpu, 0x14, i); | ||
| 6989 | if (!best) | ||
| 6990 | return; | ||
| 6991 | vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; | ||
| 6992 | vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx; | ||
| 6993 | vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx; | ||
| 6994 | vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx; | ||
| 6995 | } | ||
| 6996 | |||
| 6997 | /* Get the number of configurable Address Ranges for filtering */ | ||
| 6998 | vmx->pt_desc.addr_range = intel_pt_validate_cap(vmx->pt_desc.caps, | ||
| 6999 | PT_CAP_num_address_ranges); | ||
| 7000 | |||
| 7001 | /* Initialize the mask, clearing the bits that have no dependency */ | ||
| 7002 | vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS | | ||
| 7003 | RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC); | ||
| 7004 | |||
| 7005 | /* | ||
| 7006 | * If CPUID.(EAX=14H,ECX=0):EBX[0]=1, CR3Filter can be set; otherwise | ||
| 7007 | * setting it injects a #GP. | ||
| 7008 | */ | ||
| 7009 | if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering)) | ||
| 7010 | vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN; | ||
| 7011 | |||
| 7012 | /* | ||
| 7013 | * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and | ||
| 7014 | * PSBFreq can be set | ||
| 7015 | */ | ||
| 7016 | if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc)) | ||
| 7017 | vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC | | ||
| 7018 | RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ); | ||
| 7019 | |||
| 7020 | /* | ||
| 7021 | * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn, BranchEn and | ||
| 7022 | * MTCFreq can be set | ||
| 7023 | */ | ||
| 7024 | if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc)) | ||
| 7025 | vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN | | ||
| 7026 | RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE); | ||
| 7027 | |||
| 7028 | /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */ | ||
| 7029 | if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite)) | ||
| 7030 | vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW | | ||
| 7031 | RTIT_CTL_PTW_EN); | ||
| 7032 | |||
| 7033 | /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */ | ||
| 7034 | if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace)) | ||
| 7035 | vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN; | ||
| 7036 | |||
| 7037 | /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */ | ||
| 7038 | if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output)) | ||
| 7039 | vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA; | ||
| 7040 | |||
| 7041 | /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */ | ||
| 7042 | if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys)) | ||
| 7043 | vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN; | ||
| 7044 | |||
| 7045 | /* Unmask the address range configuration fields */ | ||
| 7046 | for (i = 0; i < vmx->pt_desc.addr_range; i++) | ||
| 7047 | vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4)); | ||
| 7048 | } | ||
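The final loop unmasks the per-range ADDRn_CFG fields of RTIT_CTL; assuming the standard layout where ADDR0_CFG occupies bits 35:32, ADDR1_CFG bits 39:36 and so on (an assumption on my part, not stated in the patch), the bits cleared per range are:

	/*
	 * Illustration only:
	 *   i == 0:  0xfULL << 32  ->  0x0000000f00000000  (ADDR0_CFG)
	 *   i == 1:  0xfULL << 36  ->  0x000000f000000000  (ADDR1_CFG)
	 * Clearing these bits in ctl_bitmask lets the guest write the
	 * corresponding ADDRn_CFG field.
	 */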
| 7049 | |||
| 7050 | static void vmx_cpuid_update(struct kvm_vcpu *vcpu) | ||
| 7051 | { | ||
| 7052 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 7053 | |||
| 7054 | if (cpu_has_secondary_exec_ctrls()) { | ||
| 7055 | vmx_compute_secondary_exec_control(vmx); | ||
| 7056 | vmcs_set_secondary_exec_control(vmx->secondary_exec_control); | ||
| 7057 | } | ||
| 7058 | |||
| 7059 | if (nested_vmx_allowed(vcpu)) | ||
| 7060 | to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= | ||
| 7061 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; | ||
| 7062 | else | ||
| 7063 | to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= | ||
| 7064 | ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; | ||
| 7065 | |||
| 7066 | if (nested_vmx_allowed(vcpu)) { | ||
| 7067 | nested_vmx_cr_fixed1_bits_update(vcpu); | ||
| 7068 | nested_vmx_entry_exit_ctls_update(vcpu); | ||
| 7069 | } | ||
| 7070 | |||
| 7071 | if (boot_cpu_has(X86_FEATURE_INTEL_PT) && | ||
| 7072 | guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) | ||
| 7073 | update_intel_pt_cfg(vcpu); | ||
| 7074 | } | ||
| 7075 | |||
| 7076 | static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) | ||
| 7077 | { | ||
| 7078 | if (func == 1 && nested) | ||
| 7079 | entry->ecx |= bit(X86_FEATURE_VMX); | ||
| 7080 | } | ||
| 7081 | |||
| 7082 | static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) | ||
| 7083 | { | ||
| 7084 | to_vmx(vcpu)->req_immediate_exit = true; | ||
| 7085 | } | ||
| 7086 | |||
| 7087 | static int vmx_check_intercept(struct kvm_vcpu *vcpu, | ||
| 7088 | struct x86_instruction_info *info, | ||
| 7089 | enum x86_intercept_stage stage) | ||
| 7090 | { | ||
| 7091 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
| 7092 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; | ||
| 7093 | |||
| 7094 | /* | ||
| 7095 | * RDPID causes #UD if disabled through secondary execution controls. | ||
| 7096 | * Because it is marked as EmulateOnUD, we need to intercept it here. | ||
| 7097 | */ | ||
| 7098 | if (info->intercept == x86_intercept_rdtscp && | ||
| 7099 | !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) { | ||
| 7100 | ctxt->exception.vector = UD_VECTOR; | ||
| 7101 | ctxt->exception.error_code_valid = false; | ||
| 7102 | return X86EMUL_PROPAGATE_FAULT; | ||
| 7103 | } | ||
| 7104 | |||
| 7105 | /* TODO: check more intercepts... */ | ||
| 7106 | return X86EMUL_CONTINUE; | ||
| 7107 | } | ||
| 7108 | |||
| 7109 | #ifdef CONFIG_X86_64 | ||
| 7110 | /* (a << shift) / divisor, return 1 if overflow otherwise 0 */ | ||
| 7111 | static inline int u64_shl_div_u64(u64 a, unsigned int shift, | ||
| 7112 | u64 divisor, u64 *result) | ||
| 7113 | { | ||
| 7114 | u64 low = a << shift, high = a >> (64 - shift); | ||
| 7115 | |||
| 7116 | /* Avoid overflow of divq: the quotient must fit in 64 bits */ | ||
| 7117 | if (high >= divisor) | ||
| 7118 | return 1; | ||
| 7119 | |||
| 7120 | /* Low holds the result, high holds the remainder, which is discarded */ | ||
| 7121 | asm("divq %2\n\t" : "=a" (low), "=d" (high) : | ||
| 7122 | "rm" (divisor), "0" (low), "1" (high)); | ||
| 7123 | *result = low; | ||
| 7124 | |||
| 7125 | return 0; | ||
| 7126 | } | ||
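The helper above is a 128-by-64-bit division done with divq; a rough equivalent using GCC's unsigned __int128 may make the overflow check clearer (a sketch only: the *_sketch name and the use of __int128 are mine, not the kernel's):

	static inline int u64_shl_div_u64_sketch(u64 a, unsigned int shift,
						 u64 divisor, u64 *result)
	{
		unsigned __int128 dividend = (unsigned __int128)a << shift;

		if (dividend / divisor > U64_MAX)
			return 1;	/* quotient does not fit in 64 bits */
		*result = dividend / divisor;
		return 0;
	}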
| 7127 | |||
| 7128 | static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) | ||
| 7129 | { | ||
| 7130 | struct vcpu_vmx *vmx; | ||
| 7131 | u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; | ||
| 7132 | |||
| 7133 | if (kvm_mwait_in_guest(vcpu->kvm)) | ||
| 7134 | return -EOPNOTSUPP; | ||
| 7135 | |||
| 7136 | vmx = to_vmx(vcpu); | ||
| 7137 | tscl = rdtsc(); | ||
| 7138 | guest_tscl = kvm_read_l1_tsc(vcpu, tscl); | ||
| 7139 | delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; | ||
| 7140 | lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns); | ||
| 7141 | |||
| 7142 | if (delta_tsc > lapic_timer_advance_cycles) | ||
| 7143 | delta_tsc -= lapic_timer_advance_cycles; | ||
| 7144 | else | ||
| 7145 | delta_tsc = 0; | ||
| 7146 | |||
| 7147 | /* Convert to host delta tsc if tsc scaling is enabled */ | ||
| 7148 | if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && | ||
| 7149 | u64_shl_div_u64(delta_tsc, | ||
| 7150 | kvm_tsc_scaling_ratio_frac_bits, | ||
| 7151 | vcpu->arch.tsc_scaling_ratio, | ||
| 7152 | &delta_tsc)) | ||
| 7153 | return -ERANGE; | ||
| 7154 | |||
| 7155 | /* | ||
| 7156 | * If the delta TSC doesn't fit in 32 bits after the preemption timer | ||
| 7157 | * rate shift, we can't use the preemption timer. | ||
| 7158 | * It's possible that it fits on later vmentries, but checking | ||
| 7159 | * on every vmentry is costly so we just use an hrtimer. | ||
| 7160 | */ | ||
| 7161 | if (delta_tsc >> (cpu_preemption_timer_multi + 32)) | ||
| 7162 | return -ERANGE; | ||
| 7163 | |||
| 7164 | vmx->hv_deadline_tsc = tscl + delta_tsc; | ||
| 7165 | return delta_tsc == 0; | ||
| 7166 | } | ||
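In formula form, the conversion above is delta_host = (delta_guest << kvm_tsc_scaling_ratio_frac_bits) / tsc_scaling_ratio; a worked number, using the 48 fractional bits set later in hardware_setup():

	/*
	 * Illustration only: a scaling ratio of 2.0 is encoded as (2ull << 48),
	 * so a guest-TSC delta of 1000 maps to a host-TSC delta of
	 *   (1000ull << 48) / (2ull << 48) == 500
	 * which is exactly what u64_shl_div_u64() computes here.
	 */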
| 7167 | |||
| 7168 | static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) | ||
| 7169 | { | ||
| 7170 | to_vmx(vcpu)->hv_deadline_tsc = -1; | ||
| 7171 | } | ||
| 7172 | #endif | ||
| 7173 | |||
| 7174 | static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) | ||
| 7175 | { | ||
| 7176 | if (!kvm_pause_in_guest(vcpu->kvm)) | ||
| 7177 | shrink_ple_window(vcpu); | ||
| 7178 | } | ||
| 7179 | |||
| 7180 | static void vmx_slot_enable_log_dirty(struct kvm *kvm, | ||
| 7181 | struct kvm_memory_slot *slot) | ||
| 7182 | { | ||
| 7183 | kvm_mmu_slot_leaf_clear_dirty(kvm, slot); | ||
| 7184 | kvm_mmu_slot_largepage_remove_write_access(kvm, slot); | ||
| 7185 | } | ||
| 7186 | |||
| 7187 | static void vmx_slot_disable_log_dirty(struct kvm *kvm, | ||
| 7188 | struct kvm_memory_slot *slot) | ||
| 7189 | { | ||
| 7190 | kvm_mmu_slot_set_dirty(kvm, slot); | ||
| 7191 | } | ||
| 7192 | |||
| 7193 | static void vmx_flush_log_dirty(struct kvm *kvm) | ||
| 7194 | { | ||
| 7195 | kvm_flush_pml_buffers(kvm); | ||
| 7196 | } | ||
| 7197 | |||
| 7198 | static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu) | ||
| 7199 | { | ||
| 7200 | struct vmcs12 *vmcs12; | ||
| 7201 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 7202 | gpa_t gpa; | ||
| 7203 | struct page *page = NULL; | ||
| 7204 | u64 *pml_address; | ||
| 7205 | |||
| 7206 | if (is_guest_mode(vcpu)) { | ||
| 7207 | WARN_ON_ONCE(vmx->nested.pml_full); | ||
| 7208 | |||
| 7209 | /* | ||
| 7210 | * Check if PML is enabled for the nested guest. | ||
| 7211 | * Whether eptp bit 6 is set is already checked | ||
| 7212 | * as part of A/D emulation. | ||
| 7213 | */ | ||
| 7214 | vmcs12 = get_vmcs12(vcpu); | ||
| 7215 | if (!nested_cpu_has_pml(vmcs12)) | ||
| 7216 | return 0; | ||
| 7217 | |||
| 7218 | if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { | ||
| 7219 | vmx->nested.pml_full = true; | ||
| 7220 | return 1; | ||
| 7221 | } | ||
| 7222 | |||
| 7223 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull; | ||
| 7224 | |||
| 7225 | page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address); | ||
| 7226 | if (is_error_page(page)) | ||
| 7227 | return 0; | ||
| 7228 | |||
| 7229 | pml_address = kmap(page); | ||
| 7230 | pml_address[vmcs12->guest_pml_index--] = gpa; | ||
| 7231 | kunmap(page); | ||
| 7232 | kvm_release_page_clean(page); | ||
| 7233 | } | ||
| 7234 | |||
| 7235 | return 0; | ||
| 7236 | } | ||
| 7237 | |||
| 7238 | static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, | ||
| 7239 | struct kvm_memory_slot *memslot, | ||
| 7240 | gfn_t offset, unsigned long mask) | ||
| 7241 | { | ||
| 7242 | kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); | ||
| 7243 | } | ||
| 7244 | |||
| 7245 | static void __pi_post_block(struct kvm_vcpu *vcpu) | ||
| 7246 | { | ||
| 7247 | struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); | ||
| 7248 | struct pi_desc old, new; | ||
| 7249 | unsigned int dest; | ||
| 7250 | |||
| 7251 | do { | ||
| 7252 | old.control = new.control = pi_desc->control; | ||
| 7253 | WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, | ||
| 7254 | "Wakeup handler not enabled while the VCPU is blocked\n"); | ||
| 7255 | |||
| 7256 | dest = cpu_physical_id(vcpu->cpu); | ||
| 7257 | |||
| 7258 | if (x2apic_enabled()) | ||
| 7259 | new.ndst = dest; | ||
| 7260 | else | ||
| 7261 | new.ndst = (dest << 8) & 0xFF00; | ||
| 7262 | |||
| 7263 | /* set 'NV' to 'notification vector' */ | ||
| 7264 | new.nv = POSTED_INTR_VECTOR; | ||
| 7265 | } while (cmpxchg64(&pi_desc->control, old.control, | ||
| 7266 | new.control) != old.control); | ||
| 7267 | |||
| 7268 | if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { | ||
| 7269 | spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); | ||
| 7270 | list_del(&vcpu->blocked_vcpu_list); | ||
| 7271 | spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); | ||
| 7272 | vcpu->pre_pcpu = -1; | ||
| 7273 | } | ||
| 7274 | } | ||
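A hedged note on the NDST encoding used here and in pi_pre_block(): per the VT-d posted-interrupt descriptor format (my reading, consistent with the (dest << 8) & 0xFF00 above), x2APIC mode stores the full 32-bit APIC ID while xAPIC mode stores the 8-bit ID in bits 15:8. The helper name below is made up for illustration:

	static inline u32 pi_encode_ndst_sketch(unsigned int apic_id, bool x2apic)
	{
		/* x2APIC: full ID; xAPIC: 8-bit ID placed in bits 15:8. */
		return x2apic ? apic_id : (apic_id << 8) & 0xFF00;
	}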
| 7275 | |||
| 7276 | /* | ||
| 7277 | * This routine does the following things for a vCPU which is going | ||
| 7278 | * to be blocked if VT-d PI is enabled. | ||
| 7279 | * - Store the vCPU to the wakeup list, so when interrupts happen | ||
| 7280 | * we can find the right vCPU to wake up. | ||
| 7281 | * - Change the Posted-interrupt descriptor as below: | ||
| 7282 | * 'NDST' <-- vcpu->pre_pcpu | ||
| 7283 | * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR | ||
| 7284 | * - If 'ON' is set during this process, meaning at least one | ||
| 7285 | * interrupt is posted for this vCPU, we cannot block it; in | ||
| 7286 | * that case, return 1. Otherwise, return 0. | ||
| 7287 | * | ||
| 7288 | */ | ||
| 7289 | static int pi_pre_block(struct kvm_vcpu *vcpu) | ||
| 7290 | { | ||
| 7291 | unsigned int dest; | ||
| 7292 | struct pi_desc old, new; | ||
| 7293 | struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); | ||
| 7294 | |||
| 7295 | if (!kvm_arch_has_assigned_device(vcpu->kvm) || | ||
| 7296 | !irq_remapping_cap(IRQ_POSTING_CAP) || | ||
| 7297 | !kvm_vcpu_apicv_active(vcpu)) | ||
| 7298 | return 0; | ||
| 7299 | |||
| 7300 | WARN_ON(irqs_disabled()); | ||
| 7301 | local_irq_disable(); | ||
| 7302 | if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { | ||
| 7303 | vcpu->pre_pcpu = vcpu->cpu; | ||
| 7304 | spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); | ||
| 7305 | list_add_tail(&vcpu->blocked_vcpu_list, | ||
| 7306 | &per_cpu(blocked_vcpu_on_cpu, | ||
| 7307 | vcpu->pre_pcpu)); | ||
| 7308 | spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); | ||
| 7309 | } | ||
| 7310 | |||
| 7311 | do { | ||
| 7312 | old.control = new.control = pi_desc->control; | ||
| 7313 | |||
| 7314 | WARN((pi_desc->sn == 1), | ||
| 7315 | "Warning: SN field of posted-interrupts " | ||
| 7316 | "is set before blocking\n"); | ||
| 7317 | |||
| 7318 | /* | ||
| 7319 | * Since vCPU can be preempted during this process, | ||
| 7320 | * vcpu->cpu could differ from pre_pcpu, so we | ||
| 7321 | * need to set pre_pcpu as the destination of the wakeup | ||
| 7322 | * notification event; then we can find the right vCPU | ||
| 7323 | * to wake up in the wakeup handler if interrupts arrive | ||
| 7324 | * while the vCPU is in the blocked state. | ||
| 7325 | */ | ||
| 7326 | dest = cpu_physical_id(vcpu->pre_pcpu); | ||
| 7327 | |||
| 7328 | if (x2apic_enabled()) | ||
| 7329 | new.ndst = dest; | ||
| 7330 | else | ||
| 7331 | new.ndst = (dest << 8) & 0xFF00; | ||
| 7332 | |||
| 7333 | /* set 'NV' to 'wakeup vector' */ | ||
| 7334 | new.nv = POSTED_INTR_WAKEUP_VECTOR; | ||
| 7335 | } while (cmpxchg64(&pi_desc->control, old.control, | ||
| 7336 | new.control) != old.control); | ||
| 7337 | |||
| 7338 | /* We should not block the vCPU if an interrupt is posted for it. */ | ||
| 7339 | if (pi_test_on(pi_desc) == 1) | ||
| 7340 | __pi_post_block(vcpu); | ||
| 7341 | |||
| 7342 | local_irq_enable(); | ||
| 7343 | return (vcpu->pre_pcpu == -1); | ||
| 7344 | } | ||
| 7345 | |||
| 7346 | static int vmx_pre_block(struct kvm_vcpu *vcpu) | ||
| 7347 | { | ||
| 7348 | if (pi_pre_block(vcpu)) | ||
| 7349 | return 1; | ||
| 7350 | |||
| 7351 | if (kvm_lapic_hv_timer_in_use(vcpu)) | ||
| 7352 | kvm_lapic_switch_to_sw_timer(vcpu); | ||
| 7353 | |||
| 7354 | return 0; | ||
| 7355 | } | ||
| 7356 | |||
| 7357 | static void pi_post_block(struct kvm_vcpu *vcpu) | ||
| 7358 | { | ||
| 7359 | if (vcpu->pre_pcpu == -1) | ||
| 7360 | return; | ||
| 7361 | |||
| 7362 | WARN_ON(irqs_disabled()); | ||
| 7363 | local_irq_disable(); | ||
| 7364 | __pi_post_block(vcpu); | ||
| 7365 | local_irq_enable(); | ||
| 7366 | } | ||
| 7367 | |||
| 7368 | static void vmx_post_block(struct kvm_vcpu *vcpu) | ||
| 7369 | { | ||
| 7370 | if (kvm_x86_ops->set_hv_timer) | ||
| 7371 | kvm_lapic_switch_to_hv_timer(vcpu); | ||
| 7372 | |||
| 7373 | pi_post_block(vcpu); | ||
| 7374 | } | ||
| 7375 | |||
| 7376 | /* | ||
| 7377 | * vmx_update_pi_irte - set IRTE for Posted-Interrupts | ||
| 7378 | * | ||
| 7379 | * @kvm: kvm | ||
| 7380 | * @host_irq: host irq of the interrupt | ||
| 7381 | * @guest_irq: gsi of the interrupt | ||
| 7382 | * @set: set or unset PI | ||
| 7383 | * returns 0 on success, < 0 on failure | ||
| 7384 | */ | ||
| 7385 | static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, | ||
| 7386 | uint32_t guest_irq, bool set) | ||
| 7387 | { | ||
| 7388 | struct kvm_kernel_irq_routing_entry *e; | ||
| 7389 | struct kvm_irq_routing_table *irq_rt; | ||
| 7390 | struct kvm_lapic_irq irq; | ||
| 7391 | struct kvm_vcpu *vcpu; | ||
| 7392 | struct vcpu_data vcpu_info; | ||
| 7393 | int idx, ret = 0; | ||
| 7394 | |||
| 7395 | if (!kvm_arch_has_assigned_device(kvm) || | ||
| 7396 | !irq_remapping_cap(IRQ_POSTING_CAP) || | ||
| 7397 | !kvm_vcpu_apicv_active(kvm->vcpus[0])) | ||
| 7398 | return 0; | ||
| 7399 | |||
| 7400 | idx = srcu_read_lock(&kvm->irq_srcu); | ||
| 7401 | irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); | ||
| 7402 | if (guest_irq >= irq_rt->nr_rt_entries || | ||
| 7403 | hlist_empty(&irq_rt->map[guest_irq])) { | ||
| 7404 | pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", | ||
| 7405 | guest_irq, irq_rt->nr_rt_entries); | ||
| 7406 | goto out; | ||
| 7407 | } | ||
| 7408 | |||
| 7409 | hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { | ||
| 7410 | if (e->type != KVM_IRQ_ROUTING_MSI) | ||
| 7411 | continue; | ||
| 7412 | /* | ||
| 7413 | * VT-d PI cannot support posting multicast/broadcast | ||
| 7414 | * interrupts to a vCPU, so we still use interrupt remapping | ||
| 7415 | * for these kinds of interrupts. | ||
| 7416 | * | ||
| 7417 | * For lowest-priority interrupts, we only support | ||
| 7418 | * those with a single CPU as the destination, e.g. the user | ||
| 7419 | * configures the interrupts via /proc/irq or uses | ||
| 7420 | * irqbalance to make the interrupts single-CPU. | ||
| 7421 | * | ||
| 7422 | * Full lowest-priority interrupt support will be added later. | ||
| 7423 | */ | ||
| 7424 | |||
| 7425 | kvm_set_msi_irq(kvm, e, &irq); | ||
| 7426 | if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { | ||
| 7427 | /* | ||
| 7428 | * Make sure the IRTE is in remapped mode if | ||
| 7429 | * we don't handle it in posted mode. | ||
| 7430 | */ | ||
| 7431 | ret = irq_set_vcpu_affinity(host_irq, NULL); | ||
| 7432 | if (ret < 0) { | ||
| 7433 | printk(KERN_INFO | ||
| 7434 | "failed to back to remapped mode, irq: %u\n", | ||
| 7435 | host_irq); | ||
| 7436 | goto out; | ||
| 7437 | } | ||
| 7438 | |||
| 7439 | continue; | ||
| 7440 | } | ||
| 7441 | |||
| 7442 | vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); | ||
| 7443 | vcpu_info.vector = irq.vector; | ||
| 7444 | |||
| 7445 | trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, | ||
| 7446 | vcpu_info.vector, vcpu_info.pi_desc_addr, set); | ||
| 7447 | |||
| 7448 | if (set) | ||
| 7449 | ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); | ||
| 7450 | else | ||
| 7451 | ret = irq_set_vcpu_affinity(host_irq, NULL); | ||
| 7452 | |||
| 7453 | if (ret < 0) { | ||
| 7454 | printk(KERN_INFO "%s: failed to update PI IRTE\n", | ||
| 7455 | __func__); | ||
| 7456 | goto out; | ||
| 7457 | } | ||
| 7458 | } | ||
| 7459 | |||
| 7460 | ret = 0; | ||
| 7461 | out: | ||
| 7462 | srcu_read_unlock(&kvm->irq_srcu, idx); | ||
| 7463 | return ret; | ||
| 7464 | } | ||
| 7465 | |||
| 7466 | static void vmx_setup_mce(struct kvm_vcpu *vcpu) | ||
| 7467 | { | ||
| 7468 | if (vcpu->arch.mcg_cap & MCG_LMCE_P) | ||
| 7469 | to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= | ||
| 7470 | FEATURE_CONTROL_LMCE; | ||
| 7471 | else | ||
| 7472 | to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= | ||
| 7473 | ~FEATURE_CONTROL_LMCE; | ||
| 7474 | } | ||
| 7475 | |||
| 7476 | static int vmx_smi_allowed(struct kvm_vcpu *vcpu) | ||
| 7477 | { | ||
| 7478 | /* we need a nested vmexit to enter SMM, postpone if run is pending */ | ||
| 7479 | if (to_vmx(vcpu)->nested.nested_run_pending) | ||
| 7480 | return 0; | ||
| 7481 | return 1; | ||
| 7482 | } | ||
| 7483 | |||
| 7484 | static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) | ||
| 7485 | { | ||
| 7486 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 7487 | |||
| 7488 | vmx->nested.smm.guest_mode = is_guest_mode(vcpu); | ||
| 7489 | if (vmx->nested.smm.guest_mode) | ||
| 7490 | nested_vmx_vmexit(vcpu, -1, 0, 0); | ||
| 7491 | |||
| 7492 | vmx->nested.smm.vmxon = vmx->nested.vmxon; | ||
| 7493 | vmx->nested.vmxon = false; | ||
| 7494 | vmx_clear_hlt(vcpu); | ||
| 7495 | return 0; | ||
| 7496 | } | ||
| 7497 | |||
| 7498 | static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) | ||
| 7499 | { | ||
| 7500 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 7501 | int ret; | ||
| 7502 | |||
| 7503 | if (vmx->nested.smm.vmxon) { | ||
| 7504 | vmx->nested.vmxon = true; | ||
| 7505 | vmx->nested.smm.vmxon = false; | ||
| 7506 | } | ||
| 7507 | |||
| 7508 | if (vmx->nested.smm.guest_mode) { | ||
| 7509 | vcpu->arch.hflags &= ~HF_SMM_MASK; | ||
| 7510 | ret = nested_vmx_enter_non_root_mode(vcpu, false); | ||
| 7511 | vcpu->arch.hflags |= HF_SMM_MASK; | ||
| 7512 | if (ret) | ||
| 7513 | return ret; | ||
| 7514 | |||
| 7515 | vmx->nested.smm.guest_mode = false; | ||
| 7516 | } | ||
| 7517 | return 0; | ||
| 7518 | } | ||
| 7519 | |||
| 7520 | static int enable_smi_window(struct kvm_vcpu *vcpu) | ||
| 7521 | { | ||
| 7522 | return 0; | ||
| 7523 | } | ||
| 7524 | |||
| 7525 | static __init int hardware_setup(void) | ||
| 7526 | { | ||
| 7527 | unsigned long host_bndcfgs; | ||
| 7528 | int r, i; | ||
| 7529 | |||
| 7530 | rdmsrl_safe(MSR_EFER, &host_efer); | ||
| 7531 | |||
| 7532 | for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) | ||
| 7533 | kvm_define_shared_msr(i, vmx_msr_index[i]); | ||
| 7534 | |||
| 7535 | if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) | ||
| 7536 | return -EIO; | ||
| 7537 | |||
| 7538 | if (boot_cpu_has(X86_FEATURE_NX)) | ||
| 7539 | kvm_enable_efer_bits(EFER_NX); | ||
| 7540 | |||
| 7541 | if (boot_cpu_has(X86_FEATURE_MPX)) { | ||
| 7542 | rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); | ||
| 7543 | WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); | ||
| 7544 | } | ||
| 7545 | |||
| 7546 | if (boot_cpu_has(X86_FEATURE_XSAVES)) | ||
| 7547 | rdmsrl(MSR_IA32_XSS, host_xss); | ||
| 7548 | |||
| 7549 | if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || | ||
| 7550 | !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) | ||
| 7551 | enable_vpid = 0; | ||
| 7552 | |||
| 7553 | if (!cpu_has_vmx_ept() || | ||
| 7554 | !cpu_has_vmx_ept_4levels() || | ||
| 7555 | !cpu_has_vmx_ept_mt_wb() || | ||
| 7556 | !cpu_has_vmx_invept_global()) | ||
| 7557 | enable_ept = 0; | ||
| 7558 | |||
| 7559 | if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) | ||
| 7560 | enable_ept_ad_bits = 0; | ||
| 7561 | |||
| 7562 | if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) | ||
| 7563 | enable_unrestricted_guest = 0; | ||
| 7564 | |||
| 7565 | if (!cpu_has_vmx_flexpriority()) | ||
| 7566 | flexpriority_enabled = 0; | ||
| 7567 | |||
| 7568 | if (!cpu_has_virtual_nmis()) | ||
| 7569 | enable_vnmi = 0; | ||
| 7570 | |||
| 7571 | /* | ||
| 7572 | * set_apic_access_page_addr() is used to reload apic access | ||
| 7573 | * page upon invalidation. No need to do anything if not | ||
| 7574 | * using the APIC_ACCESS_ADDR VMCS field. | ||
| 7575 | */ | ||
| 7576 | if (!flexpriority_enabled) | ||
| 7577 | kvm_x86_ops->set_apic_access_page_addr = NULL; | ||
| 7578 | |||
| 7579 | if (!cpu_has_vmx_tpr_shadow()) | ||
| 7580 | kvm_x86_ops->update_cr8_intercept = NULL; | ||
| 7581 | |||
| 7582 | if (enable_ept && !cpu_has_vmx_ept_2m_page()) | ||
| 7583 | kvm_disable_largepages(); | ||
| 7584 | |||
| 7585 | #if IS_ENABLED(CONFIG_HYPERV) | ||
| 7586 | if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH | ||
| 7587 | && enable_ept) { | ||
| 7588 | kvm_x86_ops->tlb_remote_flush = hv_remote_flush_tlb; | ||
| 7589 | kvm_x86_ops->tlb_remote_flush_with_range = | ||
| 7590 | hv_remote_flush_tlb_with_range; | ||
| 7591 | } | ||
| 7592 | #endif | ||
| 7593 | |||
| 7594 | if (!cpu_has_vmx_ple()) { | ||
| 7595 | ple_gap = 0; | ||
| 7596 | ple_window = 0; | ||
| 7597 | ple_window_grow = 0; | ||
| 7598 | ple_window_max = 0; | ||
| 7599 | ple_window_shrink = 0; | ||
| 7600 | } | ||
| 7601 | |||
| 7602 | if (!cpu_has_vmx_apicv()) { | ||
| 7603 | enable_apicv = 0; | ||
| 7604 | kvm_x86_ops->sync_pir_to_irr = NULL; | ||
| 7605 | } | ||
| 7606 | |||
| 7607 | if (cpu_has_vmx_tsc_scaling()) { | ||
| 7608 | kvm_has_tsc_control = true; | ||
| 7609 | kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; | ||
| 7610 | kvm_tsc_scaling_ratio_frac_bits = 48; | ||
| 7611 | } | ||
| 7612 | |||
| 7613 | set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ | ||
| 7614 | |||
| 7615 | if (enable_ept) | ||
| 7616 | vmx_enable_tdp(); | ||
| 7617 | else | ||
| 7618 | kvm_disable_tdp(); | ||
| 7619 | |||
| 7620 | /* | ||
| 7621 | * Only enable PML when hardware supports PML feature, and both EPT | ||
| 7622 | * and EPT A/D bit features are enabled -- PML depends on them to work. | ||
| 7623 | */ | ||
| 7624 | if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) | ||
| 7625 | enable_pml = 0; | ||
| 7626 | |||
| 7627 | if (!enable_pml) { | ||
| 7628 | kvm_x86_ops->slot_enable_log_dirty = NULL; | ||
| 7629 | kvm_x86_ops->slot_disable_log_dirty = NULL; | ||
| 7630 | kvm_x86_ops->flush_log_dirty = NULL; | ||
| 7631 | kvm_x86_ops->enable_log_dirty_pt_masked = NULL; | ||
| 7632 | } | ||
| 7633 | |||
| 7634 | if (!cpu_has_vmx_preemption_timer()) | ||
| 7635 | kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit; | ||
| 7636 | |||
| 7637 | if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) { | ||
| 7638 | u64 vmx_msr; | ||
| 7639 | |||
| 7640 | rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); | ||
| 7641 | cpu_preemption_timer_multi = | ||
| 7642 | vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; | ||
| 7643 | } else { | ||
| 7644 | kvm_x86_ops->set_hv_timer = NULL; | ||
| 7645 | kvm_x86_ops->cancel_hv_timer = NULL; | ||
| 7646 | } | ||
| 7647 | |||
| 7648 | kvm_set_posted_intr_wakeup_handler(wakeup_handler); | ||
| 7649 | |||
| 7650 | kvm_mce_cap_supported |= MCG_LMCE_P; | ||
| 7651 | |||
| 7652 | if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) | ||
| 7653 | return -EINVAL; | ||
| 7654 | if (!enable_ept || !cpu_has_vmx_intel_pt()) | ||
| 7655 | pt_mode = PT_MODE_SYSTEM; | ||
| 7656 | |||
| 7657 | if (nested) { | ||
| 7658 | nested_vmx_setup_ctls_msrs(&vmcs_config.nested, | ||
| 7659 | vmx_capability.ept, enable_apicv); | ||
| 7660 | |||
| 7661 | r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); | ||
| 7662 | if (r) | ||
| 7663 | return r; | ||
| 7664 | } | ||
| 7665 | |||
| 7666 | r = alloc_kvm_area(); | ||
| 7667 | if (r) | ||
| 7668 | nested_vmx_hardware_unsetup(); | ||
| 7669 | return r; | ||
| 7670 | } | ||
| 7671 | |||
| 7672 | static __exit void hardware_unsetup(void) | ||
| 7673 | { | ||
| 7674 | if (nested) | ||
| 7675 | nested_vmx_hardware_unsetup(); | ||
| 7676 | |||
| 7677 | free_kvm_area(); | ||
| 7678 | } | ||
| 7679 | |||
| 7680 | static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { | ||
| 7681 | .cpu_has_kvm_support = cpu_has_kvm_support, | ||
| 7682 | .disabled_by_bios = vmx_disabled_by_bios, | ||
| 7683 | .hardware_setup = hardware_setup, | ||
| 7684 | .hardware_unsetup = hardware_unsetup, | ||
| 7685 | .check_processor_compatibility = vmx_check_processor_compat, | ||
| 7686 | .hardware_enable = hardware_enable, | ||
| 7687 | .hardware_disable = hardware_disable, | ||
| 7688 | .cpu_has_accelerated_tpr = report_flexpriority, | ||
| 7689 | .has_emulated_msr = vmx_has_emulated_msr, | ||
| 7690 | |||
| 7691 | .vm_init = vmx_vm_init, | ||
| 7692 | .vm_alloc = vmx_vm_alloc, | ||
| 7693 | .vm_free = vmx_vm_free, | ||
| 7694 | |||
| 7695 | .vcpu_create = vmx_create_vcpu, | ||
| 7696 | .vcpu_free = vmx_free_vcpu, | ||
| 7697 | .vcpu_reset = vmx_vcpu_reset, | ||
| 7698 | |||
| 7699 | .prepare_guest_switch = vmx_prepare_switch_to_guest, | ||
| 7700 | .vcpu_load = vmx_vcpu_load, | ||
| 7701 | .vcpu_put = vmx_vcpu_put, | ||
| 7702 | |||
| 7703 | .update_bp_intercept = update_exception_bitmap, | ||
| 7704 | .get_msr_feature = vmx_get_msr_feature, | ||
| 7705 | .get_msr = vmx_get_msr, | ||
| 7706 | .set_msr = vmx_set_msr, | ||
| 7707 | .get_segment_base = vmx_get_segment_base, | ||
| 7708 | .get_segment = vmx_get_segment, | ||
| 7709 | .set_segment = vmx_set_segment, | ||
| 7710 | .get_cpl = vmx_get_cpl, | ||
| 7711 | .get_cs_db_l_bits = vmx_get_cs_db_l_bits, | ||
| 7712 | .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, | ||
| 7713 | .decache_cr3 = vmx_decache_cr3, | ||
| 7714 | .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, | ||
| 7715 | .set_cr0 = vmx_set_cr0, | ||
| 7716 | .set_cr3 = vmx_set_cr3, | ||
| 7717 | .set_cr4 = vmx_set_cr4, | ||
| 7718 | .set_efer = vmx_set_efer, | ||
| 7719 | .get_idt = vmx_get_idt, | ||
| 7720 | .set_idt = vmx_set_idt, | ||
| 7721 | .get_gdt = vmx_get_gdt, | ||
| 7722 | .set_gdt = vmx_set_gdt, | ||
| 7723 | .get_dr6 = vmx_get_dr6, | ||
| 7724 | .set_dr6 = vmx_set_dr6, | ||
| 7725 | .set_dr7 = vmx_set_dr7, | ||
| 7726 | .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, | ||
| 7727 | .cache_reg = vmx_cache_reg, | ||
| 7728 | .get_rflags = vmx_get_rflags, | ||
| 7729 | .set_rflags = vmx_set_rflags, | ||
| 7730 | |||
| 7731 | .tlb_flush = vmx_flush_tlb, | ||
| 7732 | .tlb_flush_gva = vmx_flush_tlb_gva, | ||
| 7733 | |||
| 7734 | .run = vmx_vcpu_run, | ||
| 7735 | .handle_exit = vmx_handle_exit, | ||
| 7736 | .skip_emulated_instruction = skip_emulated_instruction, | ||
| 7737 | .set_interrupt_shadow = vmx_set_interrupt_shadow, | ||
| 7738 | .get_interrupt_shadow = vmx_get_interrupt_shadow, | ||
| 7739 | .patch_hypercall = vmx_patch_hypercall, | ||
| 7740 | .set_irq = vmx_inject_irq, | ||
| 7741 | .set_nmi = vmx_inject_nmi, | ||
| 7742 | .queue_exception = vmx_queue_exception, | ||
| 7743 | .cancel_injection = vmx_cancel_injection, | ||
| 7744 | .interrupt_allowed = vmx_interrupt_allowed, | ||
| 7745 | .nmi_allowed = vmx_nmi_allowed, | ||
| 7746 | .get_nmi_mask = vmx_get_nmi_mask, | ||
| 7747 | .set_nmi_mask = vmx_set_nmi_mask, | ||
| 7748 | .enable_nmi_window = enable_nmi_window, | ||
| 7749 | .enable_irq_window = enable_irq_window, | ||
| 7750 | .update_cr8_intercept = update_cr8_intercept, | ||
| 7751 | .set_virtual_apic_mode = vmx_set_virtual_apic_mode, | ||
| 7752 | .set_apic_access_page_addr = vmx_set_apic_access_page_addr, | ||
| 7753 | .get_enable_apicv = vmx_get_enable_apicv, | ||
| 7754 | .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, | ||
| 7755 | .load_eoi_exitmap = vmx_load_eoi_exitmap, | ||
| 7756 | .apicv_post_state_restore = vmx_apicv_post_state_restore, | ||
| 7757 | .hwapic_irr_update = vmx_hwapic_irr_update, | ||
| 7758 | .hwapic_isr_update = vmx_hwapic_isr_update, | ||
| 7759 | .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, | ||
| 7760 | .sync_pir_to_irr = vmx_sync_pir_to_irr, | ||
| 7761 | .deliver_posted_interrupt = vmx_deliver_posted_interrupt, | ||
| 7762 | |||
| 7763 | .set_tss_addr = vmx_set_tss_addr, | ||
| 7764 | .set_identity_map_addr = vmx_set_identity_map_addr, | ||
| 7765 | .get_tdp_level = get_ept_level, | ||
| 7766 | .get_mt_mask = vmx_get_mt_mask, | ||
| 7767 | |||
| 7768 | .get_exit_info = vmx_get_exit_info, | ||
| 7769 | |||
| 7770 | .get_lpage_level = vmx_get_lpage_level, | ||
| 7771 | |||
| 7772 | .cpuid_update = vmx_cpuid_update, | ||
| 7773 | |||
| 7774 | .rdtscp_supported = vmx_rdtscp_supported, | ||
| 7775 | .invpcid_supported = vmx_invpcid_supported, | ||
| 7776 | |||
| 7777 | .set_supported_cpuid = vmx_set_supported_cpuid, | ||
| 7778 | |||
| 7779 | .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, | ||
| 7780 | |||
| 7781 | .read_l1_tsc_offset = vmx_read_l1_tsc_offset, | ||
| 7782 | .write_l1_tsc_offset = vmx_write_l1_tsc_offset, | ||
| 7783 | |||
| 7784 | .set_tdp_cr3 = vmx_set_cr3, | ||
| 7785 | |||
| 7786 | .check_intercept = vmx_check_intercept, | ||
| 7787 | .handle_external_intr = vmx_handle_external_intr, | ||
| 7788 | .mpx_supported = vmx_mpx_supported, | ||
| 7789 | .xsaves_supported = vmx_xsaves_supported, | ||
| 7790 | .umip_emulated = vmx_umip_emulated, | ||
| 7791 | .pt_supported = vmx_pt_supported, | ||
| 7792 | |||
| 7793 | .request_immediate_exit = vmx_request_immediate_exit, | ||
| 7794 | |||
| 7795 | .sched_in = vmx_sched_in, | ||
| 7796 | |||
| 7797 | .slot_enable_log_dirty = vmx_slot_enable_log_dirty, | ||
| 7798 | .slot_disable_log_dirty = vmx_slot_disable_log_dirty, | ||
| 7799 | .flush_log_dirty = vmx_flush_log_dirty, | ||
| 7800 | .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, | ||
| 7801 | .write_log_dirty = vmx_write_pml_buffer, | ||
| 7802 | |||
| 7803 | .pre_block = vmx_pre_block, | ||
| 7804 | .post_block = vmx_post_block, | ||
| 7805 | |||
| 7806 | .pmu_ops = &intel_pmu_ops, | ||
| 7807 | |||
| 7808 | .update_pi_irte = vmx_update_pi_irte, | ||
| 7809 | |||
| 7810 | #ifdef CONFIG_X86_64 | ||
| 7811 | .set_hv_timer = vmx_set_hv_timer, | ||
| 7812 | .cancel_hv_timer = vmx_cancel_hv_timer, | ||
| 7813 | #endif | ||
| 7814 | |||
| 7815 | .setup_mce = vmx_setup_mce, | ||
| 7816 | |||
| 7817 | .smi_allowed = vmx_smi_allowed, | ||
| 7818 | .pre_enter_smm = vmx_pre_enter_smm, | ||
| 7819 | .pre_leave_smm = vmx_pre_leave_smm, | ||
| 7820 | .enable_smi_window = enable_smi_window, | ||
| 7821 | |||
| 7822 | .check_nested_events = NULL, | ||
| 7823 | .get_nested_state = NULL, | ||
| 7824 | .set_nested_state = NULL, | ||
| 7825 | .get_vmcs12_pages = NULL, | ||
| 7826 | .nested_enable_evmcs = NULL, | ||
| 7827 | }; | ||
| 7828 | |||
| 7829 | static void vmx_cleanup_l1d_flush(void) | ||
| 7830 | { | ||
| 7831 | if (vmx_l1d_flush_pages) { | ||
| 7832 | free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); | ||
| 7833 | vmx_l1d_flush_pages = NULL; | ||
| 7834 | } | ||
| 7835 | /* Restore state so sysfs ignores VMX */ | ||
| 7836 | l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; | ||
| 7837 | } | ||
| 7838 | |||
| 7839 | static void vmx_exit(void) | ||
| 7840 | { | ||
| 7841 | #ifdef CONFIG_KEXEC_CORE | ||
| 7842 | RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); | ||
| 7843 | synchronize_rcu(); | ||
| 7844 | #endif | ||
| 7845 | |||
| 7846 | kvm_exit(); | ||
| 7847 | |||
| 7848 | #if IS_ENABLED(CONFIG_HYPERV) | ||
| 7849 | if (static_branch_unlikely(&enable_evmcs)) { | ||
| 7850 | int cpu; | ||
| 7851 | struct hv_vp_assist_page *vp_ap; | ||
| 7852 | /* | ||
| 7853 | * Reset everything to support using non-enlightened VMCS | ||
| 7854 | * access later (e.g. when we reload the module with | ||
| 7855 | * enlightened_vmcs=0) | ||
| 7856 | */ | ||
| 7857 | for_each_online_cpu(cpu) { | ||
| 7858 | vp_ap = hv_get_vp_assist_page(cpu); | ||
| 7859 | |||
| 7860 | if (!vp_ap) | ||
| 7861 | continue; | ||
| 7862 | |||
| 7863 | vp_ap->current_nested_vmcs = 0; | ||
| 7864 | vp_ap->enlighten_vmentry = 0; | ||
| 7865 | } | ||
| 7866 | |||
| 7867 | static_branch_disable(&enable_evmcs); | ||
| 7868 | } | ||
| 7869 | #endif | ||
| 7870 | vmx_cleanup_l1d_flush(); | ||
| 7871 | } | ||
| 7872 | module_exit(vmx_exit); | ||
| 7873 | |||
| 7874 | static int __init vmx_init(void) | ||
| 7875 | { | ||
| 7876 | int r; | ||
| 7877 | |||
| 7878 | #if IS_ENABLED(CONFIG_HYPERV) | ||
| 7879 | /* | ||
| 7880 | * Enlightened VMCS usage must be recommended by the hypervisor and the | ||
| 7881 | * host needs to support eVMCS v1 or above. eVMCS support can also be | ||
| 7882 | * disabled with the module parameter. | ||
| 7883 | */ | ||
| 7884 | if (enlightened_vmcs && | ||
| 7885 | ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && | ||
| 7886 | (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= | ||
| 7887 | KVM_EVMCS_VERSION) { | ||
| 7888 | int cpu; | ||
| 7889 | |||
| 7890 | /* Check that we have assist pages on all online CPUs */ | ||
| 7891 | for_each_online_cpu(cpu) { | ||
| 7892 | if (!hv_get_vp_assist_page(cpu)) { | ||
| 7893 | enlightened_vmcs = false; | ||
| 7894 | break; | ||
| 7895 | } | ||
| 7896 | } | ||
| 7897 | |||
| 7898 | if (enlightened_vmcs) { | ||
| 7899 | pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n"); | ||
| 7900 | static_branch_enable(&enable_evmcs); | ||
| 7901 | } | ||
| 7902 | } else { | ||
| 7903 | enlightened_vmcs = false; | ||
| 7904 | } | ||
| 7905 | #endif | ||
| 7906 | |||
| 7907 | r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), | ||
| 7908 | __alignof__(struct vcpu_vmx), THIS_MODULE); | ||
| 7909 | if (r) | ||
| 7910 | return r; | ||
| 7911 | |||
| 7912 | /* | ||
| 7913 | * Must be called after kvm_init() so enable_ept is properly set | ||
| 7914 | * up. Pass in the mitigation value that was stored by the | ||
| 7915 | * pre-module-init parameter parser. If no parameter was given, it will | ||
| 7916 | * contain 'auto', which will be turned into the default 'cond' | ||
| 7917 | * mitigation mode. | ||
| 7918 | */ | ||
| 7919 | if (boot_cpu_has(X86_BUG_L1TF)) { | ||
| 7920 | r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); | ||
| 7921 | if (r) { | ||
| 7922 | vmx_exit(); | ||
| 7923 | return r; | ||
| 7924 | } | ||
| 7925 | } | ||
| 7926 | |||
| 7927 | #ifdef CONFIG_KEXEC_CORE | ||
| 7928 | rcu_assign_pointer(crash_vmclear_loaded_vmcss, | ||
| 7929 | crash_vmclear_local_loaded_vmcss); | ||
| 7930 | #endif | ||
| 7931 | vmx_check_vmcs12_offsets(); | ||
| 7932 | |||
| 7933 | return 0; | ||
| 7934 | } | ||
| 7935 | module_init(vmx_init); | ||
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h new file mode 100644 index 000000000000..99328954c2fc --- /dev/null +++ b/arch/x86/kvm/vmx/vmx.h | |||
| @@ -0,0 +1,519 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
| 2 | #ifndef __KVM_X86_VMX_H | ||
| 3 | #define __KVM_X86_VMX_H | ||
| 4 | |||
| 5 | #include <linux/kvm_host.h> | ||
| 6 | |||
| 7 | #include <asm/kvm.h> | ||
| 8 | #include <asm/intel_pt.h> | ||
| 9 | |||
| 10 | #include "capabilities.h" | ||
| 11 | #include "ops.h" | ||
| 12 | #include "vmcs.h" | ||
| 13 | |||
| 14 | extern const u32 vmx_msr_index[]; | ||
| 15 | extern u64 host_efer; | ||
| 16 | |||
| 17 | #define MSR_TYPE_R 1 | ||
| 18 | #define MSR_TYPE_W 2 | ||
| 19 | #define MSR_TYPE_RW 3 | ||
| 20 | |||
| 21 | #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) | ||
| 22 | |||
| 23 | #define NR_AUTOLOAD_MSRS 8 | ||
| 24 | |||
| 25 | struct vmx_msrs { | ||
| 26 | unsigned int nr; | ||
| 27 | struct vmx_msr_entry val[NR_AUTOLOAD_MSRS]; | ||
| 28 | }; | ||
| 29 | |||
| 30 | struct shared_msr_entry { | ||
| 31 | unsigned index; | ||
| 32 | u64 data; | ||
| 33 | u64 mask; | ||
| 34 | }; | ||
| 35 | |||
| 36 | enum segment_cache_field { | ||
| 37 | SEG_FIELD_SEL = 0, | ||
| 38 | SEG_FIELD_BASE = 1, | ||
| 39 | SEG_FIELD_LIMIT = 2, | ||
| 40 | SEG_FIELD_AR = 3, | ||
| 41 | |||
| 42 | SEG_FIELD_NR = 4 | ||
| 43 | }; | ||
| 44 | |||
| 45 | /* Posted-Interrupt Descriptor */ | ||
| 46 | struct pi_desc { | ||
| 47 | u32 pir[8]; /* Posted interrupt requested */ | ||
| 48 | union { | ||
| 49 | struct { | ||
| 50 | /* bit 256 - Outstanding Notification */ | ||
| 51 | u16 on : 1, | ||
| 52 | /* bit 257 - Suppress Notification */ | ||
| 53 | sn : 1, | ||
| 54 | /* bit 271:258 - Reserved */ | ||
| 55 | rsvd_1 : 14; | ||
| 56 | /* bit 279:272 - Notification Vector */ | ||
| 57 | u8 nv; | ||
| 58 | /* bit 287:280 - Reserved */ | ||
| 59 | u8 rsvd_2; | ||
| 60 | /* bit 319:288 - Notification Destination */ | ||
| 61 | u32 ndst; | ||
| 62 | }; | ||
| 63 | u64 control; | ||
| 64 | }; | ||
| 65 | u32 rsvd[6]; | ||
| 66 | } __aligned(64); | ||
| 67 | |||
| 68 | #define RTIT_ADDR_RANGE 4 | ||
| 69 | |||
| 70 | struct pt_ctx { | ||
| 71 | u64 ctl; | ||
| 72 | u64 status; | ||
| 73 | u64 output_base; | ||
| 74 | u64 output_mask; | ||
| 75 | u64 cr3_match; | ||
| 76 | u64 addr_a[RTIT_ADDR_RANGE]; | ||
| 77 | u64 addr_b[RTIT_ADDR_RANGE]; | ||
| 78 | }; | ||
| 79 | |||
| 80 | struct pt_desc { | ||
| 81 | u64 ctl_bitmask; | ||
| 82 | u32 addr_range; | ||
| 83 | u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; | ||
| 84 | struct pt_ctx host; | ||
| 85 | struct pt_ctx guest; | ||
| 86 | }; | ||
| 87 | |||
| 88 | /* | ||
| 89 | * The nested_vmx structure is part of vcpu_vmx, and holds information we need | ||
| 90 | * for correct emulation of VMX (i.e., nested VMX) on this vcpu. | ||
| 91 | */ | ||
| 92 | struct nested_vmx { | ||
| 93 | /* Has the level1 guest done vmxon? */ | ||
| 94 | bool vmxon; | ||
| 95 | gpa_t vmxon_ptr; | ||
| 96 | bool pml_full; | ||
| 97 | |||
| 98 | /* The guest-physical address of the current VMCS L1 keeps for L2 */ | ||
| 99 | gpa_t current_vmptr; | ||
| 100 | /* | ||
| 101 | * Cache of the guest's VMCS, existing outside of guest memory. | ||
| 102 | * Loaded from guest memory during VMPTRLD. Flushed to guest | ||
| 103 | * memory during VMCLEAR and VMPTRLD. | ||
| 104 | */ | ||
| 105 | struct vmcs12 *cached_vmcs12; | ||
| 106 | /* | ||
| 107 | * Cache of the guest's shadow VMCS, existing outside of guest | ||
| 108 | * memory. Loaded from guest memory during VM entry. Flushed | ||
| 109 | * to guest memory during VM exit. | ||
| 110 | */ | ||
| 111 | struct vmcs12 *cached_shadow_vmcs12; | ||
| 112 | /* | ||
| 113 | * Indicates if the shadow vmcs or enlightened vmcs must be updated | ||
| 114 | * with the data held by struct vmcs12. | ||
| 115 | */ | ||
| 116 | bool need_vmcs12_sync; | ||
| 117 | bool dirty_vmcs12; | ||
| 118 | |||
| 119 | /* | ||
| 120 | * vmcs02 has been initialized, i.e. state that is constant for | ||
| 121 | * vmcs02 has been written to the backing VMCS. Initialization | ||
| 122 | * is delayed until L1 actually attempts to run a nested VM. | ||
| 123 | */ | ||
| 124 | bool vmcs02_initialized; | ||
| 125 | |||
| 126 | bool change_vmcs01_virtual_apic_mode; | ||
| 127 | |||
| 128 | /* | ||
| 129 | * Enlightened VMCS has been enabled. It does not mean that L1 has to | ||
| 130 | * use it. However, VMX features available to L1 will be limited based | ||
| 131 | * on what the enlightened VMCS supports. | ||
| 132 | */ | ||
| 133 | bool enlightened_vmcs_enabled; | ||
| 134 | |||
| 135 | /* L2 must run next, and mustn't decide to exit to L1. */ | ||
| 136 | bool nested_run_pending; | ||
| 137 | |||
| 138 | struct loaded_vmcs vmcs02; | ||
| 139 | |||
| 140 | /* | ||
| 141 | * Guest pages referred to in the vmcs02 with host-physical | ||
| 142 | * pointers, so we must keep them pinned while L2 runs. | ||
| 143 | */ | ||
| 144 | struct page *apic_access_page; | ||
| 145 | struct page *virtual_apic_page; | ||
| 146 | struct page *pi_desc_page; | ||
| 147 | struct pi_desc *pi_desc; | ||
| 148 | bool pi_pending; | ||
| 149 | u16 posted_intr_nv; | ||
| 150 | |||
| 151 | struct hrtimer preemption_timer; | ||
| 152 | bool preemption_timer_expired; | ||
| 153 | |||
| 154 | /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ | ||
| 155 | u64 vmcs01_debugctl; | ||
| 156 | u64 vmcs01_guest_bndcfgs; | ||
| 157 | |||
| 158 | u16 vpid02; | ||
| 159 | u16 last_vpid; | ||
| 160 | |||
| 161 | struct nested_vmx_msrs msrs; | ||
| 162 | |||
| 163 | /* SMM related state */ | ||
| 164 | struct { | ||
| 165 | /* in VMX operation on SMM entry? */ | ||
| 166 | bool vmxon; | ||
| 167 | /* in guest mode on SMM entry? */ | ||
| 168 | bool guest_mode; | ||
| 169 | } smm; | ||
| 170 | |||
| 171 | gpa_t hv_evmcs_vmptr; | ||
| 172 | struct page *hv_evmcs_page; | ||
| 173 | struct hv_enlightened_vmcs *hv_evmcs; | ||
| 174 | }; | ||
| 175 | |||
| 176 | struct vcpu_vmx { | ||
| 177 | struct kvm_vcpu vcpu; | ||
| 178 | unsigned long host_rsp; | ||
| 179 | u8 fail; | ||
| 180 | u8 msr_bitmap_mode; | ||
| 181 | u32 exit_intr_info; | ||
| 182 | u32 idt_vectoring_info; | ||
| 183 | ulong rflags; | ||
| 184 | struct shared_msr_entry *guest_msrs; | ||
| 185 | int nmsrs; | ||
| 186 | int save_nmsrs; | ||
| 187 | bool guest_msrs_dirty; | ||
| 188 | unsigned long host_idt_base; | ||
| 189 | #ifdef CONFIG_X86_64 | ||
| 190 | u64 msr_host_kernel_gs_base; | ||
| 191 | u64 msr_guest_kernel_gs_base; | ||
| 192 | #endif | ||
| 193 | |||
| 194 | u64 arch_capabilities; | ||
| 195 | u64 spec_ctrl; | ||
| 196 | |||
| 197 | u32 vm_entry_controls_shadow; | ||
| 198 | u32 vm_exit_controls_shadow; | ||
| 199 | u32 secondary_exec_control; | ||
| 200 | |||
| 201 | /* | ||
| 202 | * loaded_vmcs points to the VMCS currently used in this vcpu. For a | ||
| 203 | * non-nested (L1) guest, it always points to vmcs01. For a nested | ||
| 204 | * guest (L2), it points to a different VMCS. loaded_cpu_state points | ||
| 205 | * to the VMCS whose state is loaded into the CPU registers that only | ||
| 206 | * need to be switched when transitioning to/from the kernel; a NULL | ||
| 207 | * value indicates that host state is loaded. | ||
| 208 | */ | ||
| 209 | struct loaded_vmcs vmcs01; | ||
| 210 | struct loaded_vmcs *loaded_vmcs; | ||
| 211 | struct loaded_vmcs *loaded_cpu_state; | ||
| 212 | bool __launched; /* temporary, used in vmx_vcpu_run */ | ||
| 213 | struct msr_autoload { | ||
| 214 | struct vmx_msrs guest; | ||
| 215 | struct vmx_msrs host; | ||
| 216 | } msr_autoload; | ||
| 217 | |||
| 218 | struct { | ||
| 219 | int vm86_active; | ||
| 220 | ulong save_rflags; | ||
| 221 | struct kvm_segment segs[8]; | ||
| 222 | } rmode; | ||
| 223 | struct { | ||
| 224 | u32 bitmask; /* 4 bits per segment (1 bit per field) */ | ||
| 225 | struct kvm_save_segment { | ||
| 226 | u16 selector; | ||
| 227 | unsigned long base; | ||
| 228 | u32 limit; | ||
| 229 | u32 ar; | ||
| 230 | } seg[8]; | ||
| 231 | } segment_cache; | ||
| 232 | int vpid; | ||
| 233 | bool emulation_required; | ||
| 234 | |||
| 235 | u32 exit_reason; | ||
| 236 | |||
| 237 | /* Posted interrupt descriptor */ | ||
| 238 | struct pi_desc pi_desc; | ||
| 239 | |||
| 240 | /* Support for a guest hypervisor (nested VMX) */ | ||
| 241 | struct nested_vmx nested; | ||
| 242 | |||
| 243 | /* Dynamic PLE window. */ | ||
| 244 | int ple_window; | ||
| 245 | bool ple_window_dirty; | ||
| 246 | |||
| 247 | bool req_immediate_exit; | ||
| 248 | |||
| 249 | /* Support for PML */ | ||
| 250 | #define PML_ENTITY_NUM 512 | ||
| 251 | struct page *pml_pg; | ||
| 252 | |||
| 253 | /* apic deadline value in host tsc */ | ||
| 254 | u64 hv_deadline_tsc; | ||
| 255 | |||
| 256 | u64 current_tsc_ratio; | ||
| 257 | |||
| 258 | u32 host_pkru; | ||
| 259 | |||
| 260 | unsigned long host_debugctlmsr; | ||
| 261 | |||
| 262 | /* | ||
| 263 | * Only bits masked by msr_ia32_feature_control_valid_bits can be set in | ||
| 264 | * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included | ||
| 265 | * in msr_ia32_feature_control_valid_bits. | ||
| 266 | */ | ||
| 267 | u64 msr_ia32_feature_control; | ||
| 268 | u64 msr_ia32_feature_control_valid_bits; | ||
| 269 | u64 ept_pointer; | ||
| 270 | |||
| 271 | struct pt_desc pt_desc; | ||
| 272 | }; | ||
| 273 | |||
| 274 | enum ept_pointers_status { | ||
| 275 | EPT_POINTERS_CHECK = 0, | ||
| 276 | EPT_POINTERS_MATCH = 1, | ||
| 277 | EPT_POINTERS_MISMATCH = 2 | ||
| 278 | }; | ||
| 279 | |||
| 280 | struct kvm_vmx { | ||
| 281 | struct kvm kvm; | ||
| 282 | |||
| 283 | unsigned int tss_addr; | ||
| 284 | bool ept_identity_pagetable_done; | ||
| 285 | gpa_t ept_identity_map_addr; | ||
| 286 | |||
| 287 | enum ept_pointers_status ept_pointers_match; | ||
| 288 | spinlock_t ept_pointer_lock; | ||
| 289 | }; | ||
| 290 | |||
| 291 | bool nested_vmx_allowed(struct kvm_vcpu *vcpu); | ||
| 292 | void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); | ||
| 293 | void vmx_vcpu_put(struct kvm_vcpu *vcpu); | ||
| 294 | int allocate_vpid(void); | ||
| 295 | void free_vpid(int vpid); | ||
| 296 | void vmx_set_constant_host_state(struct vcpu_vmx *vmx); | ||
| 297 | void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); | ||
| 298 | int vmx_get_cpl(struct kvm_vcpu *vcpu); | ||
| 299 | unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); | ||
| 300 | void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); | ||
| 301 | u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu); | ||
| 302 | void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask); | ||
| 303 | void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer); | ||
| 304 | void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); | ||
| 305 | void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); | ||
| 306 | int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); | ||
| 307 | void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); | ||
| 308 | void ept_save_pdptrs(struct kvm_vcpu *vcpu); | ||
| 309 | void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); | ||
| 310 | void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); | ||
| 311 | u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); | ||
| 312 | void update_exception_bitmap(struct kvm_vcpu *vcpu); | ||
| 313 | void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); | ||
| 314 | bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); | ||
| 315 | void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); | ||
| 316 | void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); | ||
| 317 | struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr); | ||
| 318 | void pt_update_intercept_for_msr(struct vcpu_vmx *vmx); | ||
| 319 | |||
| 320 | #define POSTED_INTR_ON 0 | ||
| 321 | #define POSTED_INTR_SN 1 | ||
| 322 | |||
| 323 | static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) | ||
| 324 | { | ||
| 325 | return test_and_set_bit(POSTED_INTR_ON, | ||
| 326 | (unsigned long *)&pi_desc->control); | ||
| 327 | } | ||
| 328 | |||
| 329 | static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) | ||
| 330 | { | ||
| 331 | return test_and_clear_bit(POSTED_INTR_ON, | ||
| 332 | (unsigned long *)&pi_desc->control); | ||
| 333 | } | ||
| 334 | |||
| 335 | static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) | ||
| 336 | { | ||
| 337 | return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); | ||
| 338 | } | ||
| 339 | |||
| 340 | static inline void pi_clear_sn(struct pi_desc *pi_desc) | ||
| 341 | { | ||
| 342 | clear_bit(POSTED_INTR_SN, | ||
| 343 | (unsigned long *)&pi_desc->control); | ||
| 344 | } | ||
| 345 | |||
| 346 | static inline void pi_set_sn(struct pi_desc *pi_desc) | ||
| 347 | { | ||
| 348 | set_bit(POSTED_INTR_SN, | ||
| 349 | (unsigned long *)&pi_desc->control); | ||
| 350 | } | ||
| 351 | |||
| 352 | static inline void pi_clear_on(struct pi_desc *pi_desc) | ||
| 353 | { | ||
| 354 | clear_bit(POSTED_INTR_ON, | ||
| 355 | (unsigned long *)&pi_desc->control); | ||
| 356 | } | ||
| 357 | |||
| 358 | static inline int pi_test_on(struct pi_desc *pi_desc) | ||
| 359 | { | ||
| 360 | return test_bit(POSTED_INTR_ON, | ||
| 361 | (unsigned long *)&pi_desc->control); | ||
| 362 | } | ||
| 363 | |||
| 364 | static inline int pi_test_sn(struct pi_desc *pi_desc) | ||
| 365 | { | ||
| 366 | return test_bit(POSTED_INTR_SN, | ||
| 367 | (unsigned long *)&pi_desc->control); | ||
| 368 | } | ||
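The pir/ON/SN helpers above are the building blocks of posted-interrupt delivery: a sender records the vector in pir[], then uses the Outstanding Notification bit to decide whether a notification is still needed. A minimal sketch of that flow, using only helpers from this header (vcpu_to_pi_desc() is defined further down) plus kvm_vcpu_kick() as a stand-in for the real notification step; the name post_interrupt_sketch is illustrative, not part of the patch:

/* Sketch: post 'vector' to a vCPU and notify it only when necessary. */
static void post_interrupt_sketch(struct kvm_vcpu *vcpu, int vector)
{
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);

	/* Record the vector in the posted-interrupt request bitmap. */
	if (pi_test_and_set_pir(vector, pi_desc))
		return;	/* already pending */

	/*
	 * Only the first setter of ON (Outstanding Notification) has to
	 * send the notification; later senders see ON already set.
	 */
	if (!pi_test_and_set_on(pi_desc))
		kvm_vcpu_kick(vcpu);
}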
| 369 | |||
| 370 | static inline u8 vmx_get_rvi(void) | ||
| 371 | { | ||
| 372 | return vmcs_read16(GUEST_INTR_STATUS) & 0xff; | ||
| 373 | } | ||
| 374 | |||
| 375 | static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx) | ||
| 376 | { | ||
| 377 | vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS); | ||
| 378 | } | ||
| 379 | |||
| 380 | static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val) | ||
| 381 | { | ||
| 382 | vmcs_write32(VM_ENTRY_CONTROLS, val); | ||
| 383 | vmx->vm_entry_controls_shadow = val; | ||
| 384 | } | ||
| 385 | |||
| 386 | static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val) | ||
| 387 | { | ||
| 388 | if (vmx->vm_entry_controls_shadow != val) | ||
| 389 | vm_entry_controls_init(vmx, val); | ||
| 390 | } | ||
| 391 | |||
| 392 | static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx) | ||
| 393 | { | ||
| 394 | return vmx->vm_entry_controls_shadow; | ||
| 395 | } | ||
| 396 | |||
| 397 | static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val) | ||
| 398 | { | ||
| 399 | vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val); | ||
| 400 | } | ||
| 401 | |||
| 402 | static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val) | ||
| 403 | { | ||
| 404 | vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val); | ||
| 405 | } | ||
| 406 | |||
| 407 | static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx) | ||
| 408 | { | ||
| 409 | vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS); | ||
| 410 | } | ||
| 411 | |||
| 412 | static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val) | ||
| 413 | { | ||
| 414 | vmcs_write32(VM_EXIT_CONTROLS, val); | ||
| 415 | vmx->vm_exit_controls_shadow = val; | ||
| 416 | } | ||
| 417 | |||
| 418 | static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val) | ||
| 419 | { | ||
| 420 | if (vmx->vm_exit_controls_shadow != val) | ||
| 421 | vm_exit_controls_init(vmx, val); | ||
| 422 | } | ||
| 423 | |||
| 424 | static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx) | ||
| 425 | { | ||
| 426 | return vmx->vm_exit_controls_shadow; | ||
| 427 | } | ||
| 428 | |||
| 429 | static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val) | ||
| 430 | { | ||
| 431 | vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val); | ||
| 432 | } | ||
| 433 | |||
| 434 | static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val) | ||
| 435 | { | ||
| 436 | vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val); | ||
| 437 | } | ||
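The vm_entry_controls_*()/vm_exit_controls_*() accessors above keep a shadow copy of the VM-entry and VM-exit control fields in vcpu_vmx, so toggling a bit only issues a VMWRITE when the value actually changes and never needs a VMREAD. A small usage sketch (toggle_debugctl_load is an illustrative name; VM_ENTRY_LOAD_DEBUG_CONTROLS is just an example bit):

/* Sketch: toggle loading of debug controls (DR7/IA32_DEBUGCTL) on VM entry. */
static void toggle_debugctl_load(struct vcpu_vmx *vmx, bool load)
{
	if (load)
		vm_entry_controls_setbit(vmx, VM_ENTRY_LOAD_DEBUG_CONTROLS);
	else
		vm_entry_controls_clearbit(vmx, VM_ENTRY_LOAD_DEBUG_CONTROLS);
	/* A repeated call with the same value hits the shadow and skips the VMWRITE. */
}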
| 438 | |||
| 439 | static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) | ||
| 440 | { | ||
| 441 | vmx->segment_cache.bitmask = 0; | ||
| 442 | } | ||
| 443 | |||
| 444 | static inline u32 vmx_vmentry_ctrl(void) | ||
| 445 | { | ||
| 446 | u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; | ||
| 447 | if (pt_mode == PT_MODE_SYSTEM) | ||
| 448 | vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | VM_ENTRY_LOAD_IA32_RTIT_CTL); | ||
| 449 | /* Loading of EFER and PERF_GLOBAL_CTRL is toggled dynamically */ | ||
| 450 | return vmentry_ctrl & | ||
| 451 | ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER); | ||
| 452 | } | ||
| 453 | |||
| 454 | static inline u32 vmx_vmexit_ctrl(void) | ||
| 455 | { | ||
| 456 | u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; | ||
| 457 | if (pt_mode == PT_MODE_SYSTEM) | ||
| 458 | vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | VM_EXIT_CLEAR_IA32_RTIT_CTL); | ||
| 459 | /* Loading of EFER and PERF_GLOBAL_CTRL is toggled dynamically */ | ||
| 460 | return vmexit_ctrl & | ||
| 461 | ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); | ||
| 462 | } | ||
| 463 | |||
| 464 | u32 vmx_exec_control(struct vcpu_vmx *vmx); | ||
| 465 | |||
| 466 | static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) | ||
| 467 | { | ||
| 468 | return container_of(kvm, struct kvm_vmx, kvm); | ||
| 469 | } | ||
| 470 | |||
| 471 | static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) | ||
| 472 | { | ||
| 473 | return container_of(vcpu, struct vcpu_vmx, vcpu); | ||
| 474 | } | ||
| 475 | |||
| 476 | static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) | ||
| 477 | { | ||
| 478 | return &(to_vmx(vcpu)->pi_desc); | ||
| 479 | } | ||
| 480 | |||
| 481 | struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu); | ||
| 482 | void free_vmcs(struct vmcs *vmcs); | ||
| 483 | int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); | ||
| 484 | void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); | ||
| 485 | void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs); | ||
| 486 | void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs); | ||
| 487 | |||
| 488 | static inline struct vmcs *alloc_vmcs(bool shadow) | ||
| 489 | { | ||
| 490 | return alloc_vmcs_cpu(shadow, raw_smp_processor_id()); | ||
| 491 | } | ||
| 492 | |||
| 493 | u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); | ||
| 494 | |||
| 495 | static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid, | ||
| 496 | bool invalidate_gpa) | ||
| 497 | { | ||
| 498 | if (enable_ept && (invalidate_gpa || !enable_vpid)) { | ||
| 499 | if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) | ||
| 500 | return; | ||
| 501 | ept_sync_context(construct_eptp(vcpu, | ||
| 502 | vcpu->arch.mmu->root_hpa)); | ||
| 503 | } else { | ||
| 504 | vpid_sync_context(vpid); | ||
| 505 | } | ||
| 506 | } | ||
| 507 | |||
| 508 | static inline void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) | ||
| 509 | { | ||
| 510 | __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa); | ||
| 511 | } | ||
| 512 | |||
| 513 | static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx) | ||
| 514 | { | ||
| 515 | vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio; | ||
| 516 | vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); | ||
| 517 | } | ||
| 518 | |||
| 519 | #endif /* __KVM_X86_VMX_H */ | ||
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f049ecfac7bb..02c8e095a239 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
| @@ -69,6 +69,7 @@ | |||
| 69 | #include <asm/irq_remapping.h> | 69 | #include <asm/irq_remapping.h> |
| 70 | #include <asm/mshyperv.h> | 70 | #include <asm/mshyperv.h> |
| 71 | #include <asm/hypervisor.h> | 71 | #include <asm/hypervisor.h> |
| 72 | #include <asm/intel_pt.h> | ||
| 72 | 73 | ||
| 73 | #define CREATE_TRACE_POINTS | 74 | #define CREATE_TRACE_POINTS |
| 74 | #include "trace.h" | 75 | #include "trace.h" |
| @@ -213,6 +214,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { | |||
| 213 | 214 | ||
| 214 | u64 __read_mostly host_xcr0; | 215 | u64 __read_mostly host_xcr0; |
| 215 | 216 | ||
| 217 | struct kmem_cache *x86_fpu_cache; | ||
| 218 | EXPORT_SYMBOL_GPL(x86_fpu_cache); | ||
| 219 | |||
| 216 | static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); | 220 | static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); |
| 217 | 221 | ||
| 218 | static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) | 222 | static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) |
| @@ -1121,7 +1125,13 @@ static u32 msrs_to_save[] = { | |||
| 1121 | #endif | 1125 | #endif |
| 1122 | MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, | 1126 | MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, |
| 1123 | MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, | 1127 | MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, |
| 1124 | MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES | 1128 | MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES, |
| 1129 | MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH, | ||
| 1130 | MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK, | ||
| 1131 | MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, | ||
| 1132 | MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, | ||
| 1133 | MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, | ||
| 1134 | MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, | ||
| 1125 | }; | 1135 | }; |
| 1126 | 1136 | ||
| 1127 | static unsigned num_msrs_to_save; | 1137 | static unsigned num_msrs_to_save; |
| @@ -2999,6 +3009,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) | |||
| 2999 | case KVM_CAP_HYPERV_TLBFLUSH: | 3009 | case KVM_CAP_HYPERV_TLBFLUSH: |
| 3000 | case KVM_CAP_HYPERV_SEND_IPI: | 3010 | case KVM_CAP_HYPERV_SEND_IPI: |
| 3001 | case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: | 3011 | case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: |
| 3012 | case KVM_CAP_HYPERV_CPUID: | ||
| 3002 | case KVM_CAP_PCI_SEGMENT: | 3013 | case KVM_CAP_PCI_SEGMENT: |
| 3003 | case KVM_CAP_DEBUGREGS: | 3014 | case KVM_CAP_DEBUGREGS: |
| 3004 | case KVM_CAP_X86_ROBUST_SINGLESTEP: | 3015 | case KVM_CAP_X86_ROBUST_SINGLESTEP: |
| @@ -3010,7 +3021,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) | |||
| 3010 | case KVM_CAP_HYPERV_TIME: | 3021 | case KVM_CAP_HYPERV_TIME: |
| 3011 | case KVM_CAP_IOAPIC_POLARITY_IGNORED: | 3022 | case KVM_CAP_IOAPIC_POLARITY_IGNORED: |
| 3012 | case KVM_CAP_TSC_DEADLINE_TIMER: | 3023 | case KVM_CAP_TSC_DEADLINE_TIMER: |
| 3013 | case KVM_CAP_ENABLE_CAP_VM: | ||
| 3014 | case KVM_CAP_DISABLE_QUIRKS: | 3024 | case KVM_CAP_DISABLE_QUIRKS: |
| 3015 | case KVM_CAP_SET_BOOT_CPU_ID: | 3025 | case KVM_CAP_SET_BOOT_CPU_ID: |
| 3016 | case KVM_CAP_SPLIT_IRQCHIP: | 3026 | case KVM_CAP_SPLIT_IRQCHIP: |
| @@ -3632,7 +3642,7 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, | |||
| 3632 | 3642 | ||
| 3633 | static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) | 3643 | static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) |
| 3634 | { | 3644 | { |
| 3635 | struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave; | 3645 | struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; |
| 3636 | u64 xstate_bv = xsave->header.xfeatures; | 3646 | u64 xstate_bv = xsave->header.xfeatures; |
| 3637 | u64 valid; | 3647 | u64 valid; |
| 3638 | 3648 | ||
| @@ -3674,7 +3684,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) | |||
| 3674 | 3684 | ||
| 3675 | static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) | 3685 | static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) |
| 3676 | { | 3686 | { |
| 3677 | struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave; | 3687 | struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; |
| 3678 | u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET); | 3688 | u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET); |
| 3679 | u64 valid; | 3689 | u64 valid; |
| 3680 | 3690 | ||
| @@ -3722,7 +3732,7 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, | |||
| 3722 | fill_xsave((u8 *) guest_xsave->region, vcpu); | 3732 | fill_xsave((u8 *) guest_xsave->region, vcpu); |
| 3723 | } else { | 3733 | } else { |
| 3724 | memcpy(guest_xsave->region, | 3734 | memcpy(guest_xsave->region, |
| 3725 | &vcpu->arch.guest_fpu.state.fxsave, | 3735 | &vcpu->arch.guest_fpu->state.fxsave, |
| 3726 | sizeof(struct fxregs_state)); | 3736 | sizeof(struct fxregs_state)); |
| 3727 | *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = | 3737 | *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = |
| 3728 | XFEATURE_MASK_FPSSE; | 3738 | XFEATURE_MASK_FPSSE; |
| @@ -3752,7 +3762,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, | |||
| 3752 | if (xstate_bv & ~XFEATURE_MASK_FPSSE || | 3762 | if (xstate_bv & ~XFEATURE_MASK_FPSSE || |
| 3753 | mxcsr & ~mxcsr_feature_mask) | 3763 | mxcsr & ~mxcsr_feature_mask) |
| 3754 | return -EINVAL; | 3764 | return -EINVAL; |
| 3755 | memcpy(&vcpu->arch.guest_fpu.state.fxsave, | 3765 | memcpy(&vcpu->arch.guest_fpu->state.fxsave, |
| 3756 | guest_xsave->region, sizeof(struct fxregs_state)); | 3766 | guest_xsave->region, sizeof(struct fxregs_state)); |
| 3757 | } | 3767 | } |
| 3758 | return 0; | 3768 | return 0; |
| @@ -3830,6 +3840,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, | |||
| 3830 | return kvm_hv_activate_synic(vcpu, cap->cap == | 3840 | return kvm_hv_activate_synic(vcpu, cap->cap == |
| 3831 | KVM_CAP_HYPERV_SYNIC2); | 3841 | KVM_CAP_HYPERV_SYNIC2); |
| 3832 | case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: | 3842 | case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: |
| 3843 | if (!kvm_x86_ops->nested_enable_evmcs) | ||
| 3844 | return -ENOTTY; | ||
| 3833 | r = kvm_x86_ops->nested_enable_evmcs(vcpu, &vmcs_version); | 3845 | r = kvm_x86_ops->nested_enable_evmcs(vcpu, &vmcs_version); |
| 3834 | if (!r) { | 3846 | if (!r) { |
| 3835 | user_ptr = (void __user *)(uintptr_t)cap->args[0]; | 3847 | user_ptr = (void __user *)(uintptr_t)cap->args[0]; |
| @@ -4192,6 +4204,25 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
| 4192 | r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); | 4204 | r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); |
| 4193 | break; | 4205 | break; |
| 4194 | } | 4206 | } |
| 4207 | case KVM_GET_SUPPORTED_HV_CPUID: { | ||
| 4208 | struct kvm_cpuid2 __user *cpuid_arg = argp; | ||
| 4209 | struct kvm_cpuid2 cpuid; | ||
| 4210 | |||
| 4211 | r = -EFAULT; | ||
| 4212 | if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) | ||
| 4213 | goto out; | ||
| 4214 | |||
| 4215 | r = kvm_vcpu_ioctl_get_hv_cpuid(vcpu, &cpuid, | ||
| 4216 | cpuid_arg->entries); | ||
| 4217 | if (r) | ||
| 4218 | goto out; | ||
| 4219 | |||
| 4220 | r = -EFAULT; | ||
| 4221 | if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid))) | ||
| 4222 | goto out; | ||
| 4223 | r = 0; | ||
| 4224 | break; | ||
| 4225 | } | ||
| 4195 | default: | 4226 | default: |
| 4196 | r = -EINVAL; | 4227 | r = -EINVAL; |
| 4197 | } | 4228 | } |
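The new KVM_GET_SUPPORTED_HV_CPUID handler above follows the usual kvm_cpuid2 convention: userspace passes a header with nent pre-sized, and KVM fills the entries and updates nent. A rough userspace sketch (get_hv_cpuid and the entry count of 64 are illustrative; error handling is up to the caller):

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: query the Hyper-V CPUID leaves KVM exposes for a given vCPU fd. */
static struct kvm_cpuid2 *get_hv_cpuid(int vcpu_fd)
{
	int nent = 64;	/* arbitrary upper bound for this example */
	struct kvm_cpuid2 *cpuid;

	cpuid = calloc(1, sizeof(*cpuid) + nent * sizeof(struct kvm_cpuid_entry2));
	if (!cpuid)
		return NULL;
	cpuid->nent = nent;

	if (ioctl(vcpu_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid) < 0) {
		free(cpuid);
		return NULL;
	}
	return cpuid;	/* cpuid->nent now holds the number of valid entries */
}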
| @@ -4396,7 +4427,7 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, | |||
| 4396 | */ | 4427 | */ |
| 4397 | int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) | 4428 | int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) |
| 4398 | { | 4429 | { |
| 4399 | bool is_dirty = false; | 4430 | bool flush = false; |
| 4400 | int r; | 4431 | int r; |
| 4401 | 4432 | ||
| 4402 | mutex_lock(&kvm->slots_lock); | 4433 | mutex_lock(&kvm->slots_lock); |
| @@ -4407,14 +4438,41 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) | |||
| 4407 | if (kvm_x86_ops->flush_log_dirty) | 4438 | if (kvm_x86_ops->flush_log_dirty) |
| 4408 | kvm_x86_ops->flush_log_dirty(kvm); | 4439 | kvm_x86_ops->flush_log_dirty(kvm); |
| 4409 | 4440 | ||
| 4410 | r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); | 4441 | r = kvm_get_dirty_log_protect(kvm, log, &flush); |
| 4411 | 4442 | ||
| 4412 | /* | 4443 | /* |
| 4413 | * All the TLBs can be flushed out of mmu lock, see the comments in | 4444 | * All the TLBs can be flushed out of mmu lock, see the comments in |
| 4414 | * kvm_mmu_slot_remove_write_access(). | 4445 | * kvm_mmu_slot_remove_write_access(). |
| 4415 | */ | 4446 | */ |
| 4416 | lockdep_assert_held(&kvm->slots_lock); | 4447 | lockdep_assert_held(&kvm->slots_lock); |
| 4417 | if (is_dirty) | 4448 | if (flush) |
| 4449 | kvm_flush_remote_tlbs(kvm); | ||
| 4450 | |||
| 4451 | mutex_unlock(&kvm->slots_lock); | ||
| 4452 | return r; | ||
| 4453 | } | ||
| 4454 | |||
| 4455 | int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log) | ||
| 4456 | { | ||
| 4457 | bool flush = false; | ||
| 4458 | int r; | ||
| 4459 | |||
| 4460 | mutex_lock(&kvm->slots_lock); | ||
| 4461 | |||
| 4462 | /* | ||
| 4463 | * Flush potentially hardware-cached dirty pages to dirty_bitmap. | ||
| 4464 | */ | ||
| 4465 | if (kvm_x86_ops->flush_log_dirty) | ||
| 4466 | kvm_x86_ops->flush_log_dirty(kvm); | ||
| 4467 | |||
| 4468 | r = kvm_clear_dirty_log_protect(kvm, log, &flush); | ||
| 4469 | |||
| 4470 | /* | ||
| 4471 | * All the TLBs can be flushed out of mmu lock, see the comments in | ||
| 4472 | * kvm_mmu_slot_remove_write_access(). | ||
| 4473 | */ | ||
| 4474 | lockdep_assert_held(&kvm->slots_lock); | ||
| 4475 | if (flush) | ||
| 4418 | kvm_flush_remote_tlbs(kvm); | 4476 | kvm_flush_remote_tlbs(kvm); |
| 4419 | 4477 | ||
| 4420 | mutex_unlock(&kvm->slots_lock); | 4478 | mutex_unlock(&kvm->slots_lock); |
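With kvm_vm_ioctl_clear_dirty_log() in place, the dirty-log flow becomes a two-step operation for VMs that opt in to manual protection: fetch the bitmap with KVM_GET_DIRTY_LOG, process it, then re-arm write protection with KVM_CLEAR_DIRTY_LOG. A rough userspace sketch (harvest_and_clear is an illustrative name; in this version first_page and num_pages must be multiples of 64):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: harvest and then clear the dirty state of one whole memslot. */
static int harvest_and_clear(int vm_fd, __u32 slot, void *bitmap, __u64 npages)
{
	struct kvm_dirty_log get = {
		.slot = slot,
		.dirty_bitmap = bitmap,
	};
	struct kvm_clear_dirty_log clear = {
		.slot = slot,
		.first_page = 0,
		.num_pages = npages,	/* multiple of 64 in this version */
		.dirty_bitmap = bitmap,
	};

	if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &get) < 0)
		return -1;
	/* ... copy out / migrate the pages whose bits are set in bitmap ... */
	return ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear);
}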
| @@ -4433,8 +4491,8 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, | |||
| 4433 | return 0; | 4491 | return 0; |
| 4434 | } | 4492 | } |
| 4435 | 4493 | ||
| 4436 | static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, | 4494 | int kvm_vm_ioctl_enable_cap(struct kvm *kvm, |
| 4437 | struct kvm_enable_cap *cap) | 4495 | struct kvm_enable_cap *cap) |
| 4438 | { | 4496 | { |
| 4439 | int r; | 4497 | int r; |
| 4440 | 4498 | ||
| @@ -4767,15 +4825,6 @@ set_identity_unlock: | |||
| 4767 | r = 0; | 4825 | r = 0; |
| 4768 | break; | 4826 | break; |
| 4769 | } | 4827 | } |
| 4770 | case KVM_ENABLE_CAP: { | ||
| 4771 | struct kvm_enable_cap cap; | ||
| 4772 | |||
| 4773 | r = -EFAULT; | ||
| 4774 | if (copy_from_user(&cap, argp, sizeof(cap))) | ||
| 4775 | goto out; | ||
| 4776 | r = kvm_vm_ioctl_enable_cap(kvm, &cap); | ||
| 4777 | break; | ||
| 4778 | } | ||
| 4779 | case KVM_MEMORY_ENCRYPT_OP: { | 4828 | case KVM_MEMORY_ENCRYPT_OP: { |
| 4780 | r = -ENOTTY; | 4829 | r = -ENOTTY; |
| 4781 | if (kvm_x86_ops->mem_enc_op) | 4830 | if (kvm_x86_ops->mem_enc_op) |
| @@ -4844,6 +4893,30 @@ static void kvm_init_msr_list(void) | |||
| 4844 | if (!kvm_x86_ops->rdtscp_supported()) | 4893 | if (!kvm_x86_ops->rdtscp_supported()) |
| 4845 | continue; | 4894 | continue; |
| 4846 | break; | 4895 | break; |
| 4896 | case MSR_IA32_RTIT_CTL: | ||
| 4897 | case MSR_IA32_RTIT_STATUS: | ||
| 4898 | if (!kvm_x86_ops->pt_supported()) | ||
| 4899 | continue; | ||
| 4900 | break; | ||
| 4901 | case MSR_IA32_RTIT_CR3_MATCH: | ||
| 4902 | if (!kvm_x86_ops->pt_supported() || | ||
| 4903 | !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) | ||
| 4904 | continue; | ||
| 4905 | break; | ||
| 4906 | case MSR_IA32_RTIT_OUTPUT_BASE: | ||
| 4907 | case MSR_IA32_RTIT_OUTPUT_MASK: | ||
| 4908 | if (!kvm_x86_ops->pt_supported() || | ||
| 4909 | (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && | ||
| 4910 | !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) | ||
| 4911 | continue; | ||
| 4912 | break; | ||
| 4913 | case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: { | ||
| 4914 | if (!kvm_x86_ops->pt_supported() || | ||
| 4915 | msrs_to_save[i] - MSR_IA32_RTIT_ADDR0_A >= | ||
| 4916 | intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) | ||
| 4917 | continue; | ||
| 4918 | break; | ||
| 4919 | } | ||
| 4847 | default: | 4920 | default: |
| 4848 | break; | 4921 | break; |
| 4849 | } | 4922 | } |
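The MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B case above relies on the ADDRn_A/ADDRn_B MSRs being numbered consecutively, so each implemented address range accounts for exactly two MSRs. A small sketch of the same arithmetic (rtit_addr_msr_supported is an illustrative helper, not part of the patch):

/* Sketch: keep ADDRn_A/ADDRn_B only for address ranges the CPU implements. */
static bool rtit_addr_msr_supported(u32 msr, u32 num_address_ranges)
{
	u32 offset = msr - MSR_IA32_RTIT_ADDR0_A;	/* A/B pairs are consecutive */

	/* e.g. num_address_ranges == 2 keeps offsets 0..3 (ADDR0_A..ADDR1_B) */
	return offset < num_address_ranges * 2;
}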
| @@ -6815,11 +6888,30 @@ int kvm_arch_init(void *opaque) | |||
| 6815 | goto out; | 6888 | goto out; |
| 6816 | } | 6889 | } |
| 6817 | 6890 | ||
| 6891 | /* | ||
| 6892 | * KVM explicitly assumes that the guest has an FPU and | ||
| 6893 | * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU ioctl explicitly casts | ||
| 6894 | * the vCPU's FPU state to an fxregs_state struct. | ||
| 6895 | */ | ||
| 6896 | if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) { | ||
| 6897 | printk(KERN_ERR "kvm: inadequate fpu\n"); | ||
| 6898 | r = -EOPNOTSUPP; | ||
| 6899 | goto out; | ||
| 6900 | } | ||
| 6901 | |||
| 6818 | r = -ENOMEM; | 6902 | r = -ENOMEM; |
| 6903 | x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu), | ||
| 6904 | __alignof__(struct fpu), SLAB_ACCOUNT, | ||
| 6905 | NULL); | ||
| 6906 | if (!x86_fpu_cache) { | ||
| 6907 | printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n"); | ||
| 6908 | goto out; | ||
| 6909 | } | ||
| 6910 | |||
| 6819 | shared_msrs = alloc_percpu(struct kvm_shared_msrs); | 6911 | shared_msrs = alloc_percpu(struct kvm_shared_msrs); |
| 6820 | if (!shared_msrs) { | 6912 | if (!shared_msrs) { |
| 6821 | printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n"); | 6913 | printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n"); |
| 6822 | goto out; | 6914 | goto out_free_x86_fpu_cache; |
| 6823 | } | 6915 | } |
| 6824 | 6916 | ||
| 6825 | r = kvm_mmu_module_init(); | 6917 | r = kvm_mmu_module_init(); |
| @@ -6852,6 +6944,8 @@ int kvm_arch_init(void *opaque) | |||
| 6852 | 6944 | ||
| 6853 | out_free_percpu: | 6945 | out_free_percpu: |
| 6854 | free_percpu(shared_msrs); | 6946 | free_percpu(shared_msrs); |
| 6947 | out_free_x86_fpu_cache: | ||
| 6948 | kmem_cache_destroy(x86_fpu_cache); | ||
| 6855 | out: | 6949 | out: |
| 6856 | return r; | 6950 | return r; |
| 6857 | } | 6951 | } |
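x86_fpu_cache exists because the guest FPU state is now reached through a pointer (vcpu->arch.guest_fpu) instead of being embedded in the vCPU, so the vendor modules are expected to allocate it from this cache at vCPU creation time. A sketch of that allocation, assuming the call site looks roughly like this (the real create-vcpu code is not part of this hunk):

/* Sketch: allocate the dynamically-sized guest FPU state for a new vCPU. */
static int alloc_guest_fpu_sketch(struct kvm_vcpu *vcpu)
{
	vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL);
	if (!vcpu->arch.guest_fpu)
		return -ENOMEM;
	return 0;
}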
| @@ -6875,6 +6969,7 @@ void kvm_arch_exit(void) | |||
| 6875 | kvm_x86_ops = NULL; | 6969 | kvm_x86_ops = NULL; |
| 6876 | kvm_mmu_module_exit(); | 6970 | kvm_mmu_module_exit(); |
| 6877 | free_percpu(shared_msrs); | 6971 | free_percpu(shared_msrs); |
| 6972 | kmem_cache_destroy(x86_fpu_cache); | ||
| 6878 | } | 6973 | } |
| 6879 | 6974 | ||
| 6880 | int kvm_vcpu_halt(struct kvm_vcpu *vcpu) | 6975 | int kvm_vcpu_halt(struct kvm_vcpu *vcpu) |
| @@ -7998,9 +8093,9 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) | |||
| 7998 | static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) | 8093 | static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) |
| 7999 | { | 8094 | { |
| 8000 | preempt_disable(); | 8095 | preempt_disable(); |
| 8001 | copy_fpregs_to_fpstate(&vcpu->arch.user_fpu); | 8096 | copy_fpregs_to_fpstate(¤t->thread.fpu); |
| 8002 | /* PKRU is separately restored in kvm_x86_ops->run. */ | 8097 | /* PKRU is separately restored in kvm_x86_ops->run. */ |
| 8003 | __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state, | 8098 | __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, |
| 8004 | ~XFEATURE_MASK_PKRU); | 8099 | ~XFEATURE_MASK_PKRU); |
| 8005 | preempt_enable(); | 8100 | preempt_enable(); |
| 8006 | trace_kvm_fpu(1); | 8101 | trace_kvm_fpu(1); |
| @@ -8010,8 +8105,8 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) | |||
| 8010 | static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) | 8105 | static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) |
| 8011 | { | 8106 | { |
| 8012 | preempt_disable(); | 8107 | preempt_disable(); |
| 8013 | copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); | 8108 | copy_fpregs_to_fpstate(vcpu->arch.guest_fpu); |
| 8014 | copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state); | 8109 | copy_kernel_to_fpregs(¤t->thread.fpu.state); |
| 8015 | preempt_enable(); | 8110 | preempt_enable(); |
| 8016 | ++vcpu->stat.fpu_reload; | 8111 | ++vcpu->stat.fpu_reload; |
| 8017 | trace_kvm_fpu(0); | 8112 | trace_kvm_fpu(0); |
| @@ -8505,7 +8600,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) | |||
| 8505 | 8600 | ||
| 8506 | vcpu_load(vcpu); | 8601 | vcpu_load(vcpu); |
| 8507 | 8602 | ||
| 8508 | fxsave = &vcpu->arch.guest_fpu.state.fxsave; | 8603 | fxsave = &vcpu->arch.guest_fpu->state.fxsave; |
| 8509 | memcpy(fpu->fpr, fxsave->st_space, 128); | 8604 | memcpy(fpu->fpr, fxsave->st_space, 128); |
| 8510 | fpu->fcw = fxsave->cwd; | 8605 | fpu->fcw = fxsave->cwd; |
| 8511 | fpu->fsw = fxsave->swd; | 8606 | fpu->fsw = fxsave->swd; |
| @@ -8525,7 +8620,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) | |||
| 8525 | 8620 | ||
| 8526 | vcpu_load(vcpu); | 8621 | vcpu_load(vcpu); |
| 8527 | 8622 | ||
| 8528 | fxsave = &vcpu->arch.guest_fpu.state.fxsave; | 8623 | fxsave = &vcpu->arch.guest_fpu->state.fxsave; |
| 8529 | 8624 | ||
| 8530 | memcpy(fxsave->st_space, fpu->fpr, 128); | 8625 | memcpy(fxsave->st_space, fpu->fpr, 128); |
| 8531 | fxsave->cwd = fpu->fcw; | 8626 | fxsave->cwd = fpu->fcw; |
| @@ -8581,9 +8676,9 @@ static int sync_regs(struct kvm_vcpu *vcpu) | |||
| 8581 | 8676 | ||
| 8582 | static void fx_init(struct kvm_vcpu *vcpu) | 8677 | static void fx_init(struct kvm_vcpu *vcpu) |
| 8583 | { | 8678 | { |
| 8584 | fpstate_init(&vcpu->arch.guest_fpu.state); | 8679 | fpstate_init(&vcpu->arch.guest_fpu->state); |
| 8585 | if (boot_cpu_has(X86_FEATURE_XSAVES)) | 8680 | if (boot_cpu_has(X86_FEATURE_XSAVES)) |
| 8586 | vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv = | 8681 | vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv = |
| 8587 | host_xcr0 | XSTATE_COMPACTION_ENABLED; | 8682 | host_xcr0 | XSTATE_COMPACTION_ENABLED; |
| 8588 | 8683 | ||
| 8589 | /* | 8684 | /* |
| @@ -8621,6 +8716,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, | |||
| 8621 | 8716 | ||
| 8622 | int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) | 8717 | int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) |
| 8623 | { | 8718 | { |
| 8719 | vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; | ||
| 8624 | kvm_vcpu_mtrr_init(vcpu); | 8720 | kvm_vcpu_mtrr_init(vcpu); |
| 8625 | vcpu_load(vcpu); | 8721 | vcpu_load(vcpu); |
| 8626 | kvm_vcpu_reset(vcpu, false); | 8722 | kvm_vcpu_reset(vcpu, false); |
| @@ -8707,11 +8803,11 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) | |||
| 8707 | */ | 8803 | */ |
| 8708 | if (init_event) | 8804 | if (init_event) |
| 8709 | kvm_put_guest_fpu(vcpu); | 8805 | kvm_put_guest_fpu(vcpu); |
| 8710 | mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave, | 8806 | mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, |
| 8711 | XFEATURE_MASK_BNDREGS); | 8807 | XFEATURE_MASK_BNDREGS); |
| 8712 | if (mpx_state_buffer) | 8808 | if (mpx_state_buffer) |
| 8713 | memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state)); | 8809 | memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state)); |
| 8714 | mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave, | 8810 | mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, |
| 8715 | XFEATURE_MASK_BNDCSR); | 8811 | XFEATURE_MASK_BNDCSR); |
| 8716 | if (mpx_state_buffer) | 8812 | if (mpx_state_buffer) |
| 8717 | memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); | 8813 | memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); |
| @@ -8723,7 +8819,6 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) | |||
| 8723 | kvm_pmu_reset(vcpu); | 8819 | kvm_pmu_reset(vcpu); |
| 8724 | vcpu->arch.smbase = 0x30000; | 8820 | vcpu->arch.smbase = 0x30000; |
| 8725 | 8821 | ||
| 8726 | vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; | ||
| 8727 | vcpu->arch.msr_misc_features_enables = 0; | 8822 | vcpu->arch.msr_misc_features_enables = 0; |
| 8728 | 8823 | ||
| 8729 | vcpu->arch.xcr0 = XFEATURE_MASK_FP; | 8824 | vcpu->arch.xcr0 = XFEATURE_MASK_FP; |
| @@ -9282,7 +9377,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, | |||
| 9282 | * with dirty logging disabled in order to eliminate unnecessary GPA | 9377 | * with dirty logging disabled in order to eliminate unnecessary GPA |
| 9283 | * logging in PML buffer (and potential PML buffer full VMEXT). This | 9378 | * logging in PML buffer (and potential PML buffer full VMEXT). This |
| 9284 | * guarantees leaving PML enabled during guest's lifetime won't have | 9379 | * guarantees leaving PML enabled during guest's lifetime won't have |
| 9285 | * any additonal overhead from PML when guest is running with dirty | 9380 | * any additional overhead from PML when guest is running with dirty |
| 9286 | * logging disabled for memory slots. | 9381 | * logging disabled for memory slots. |
| 9287 | * | 9382 | * |
| 9288 | * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot | 9383 | * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot |
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 332d7c34be5c..11273cd384d6 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c | |||
| @@ -143,7 +143,7 @@ static int hv_ce_shutdown(struct clock_event_device *evt) | |||
| 143 | 143 | ||
| 144 | static int hv_ce_set_oneshot(struct clock_event_device *evt) | 144 | static int hv_ce_set_oneshot(struct clock_event_device *evt) |
| 145 | { | 145 | { |
| 146 | union hv_timer_config timer_cfg; | 146 | union hv_stimer_config timer_cfg; |
| 147 | 147 | ||
| 148 | timer_cfg.as_uint64 = 0; | 148 | timer_cfg.as_uint64 = 0; |
| 149 | timer_cfg.enable = 1; | 149 | timer_cfg.enable = 1; |
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 87d3d7da78f8..ea201034b248 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h | |||
| @@ -44,74 +44,6 @@ | |||
| 44 | */ | 44 | */ |
| 45 | #define HV_UTIL_NEGO_TIMEOUT 55 | 45 | #define HV_UTIL_NEGO_TIMEOUT 55 |
| 46 | 46 | ||
| 47 | /* Define synthetic interrupt controller flag constants. */ | ||
| 48 | #define HV_EVENT_FLAGS_COUNT (256 * 8) | ||
| 49 | #define HV_EVENT_FLAGS_LONG_COUNT (256 / sizeof(unsigned long)) | ||
| 50 | |||
| 51 | /* | ||
| 52 | * Timer configuration register. | ||
| 53 | */ | ||
| 54 | union hv_timer_config { | ||
| 55 | u64 as_uint64; | ||
| 56 | struct { | ||
| 57 | u64 enable:1; | ||
| 58 | u64 periodic:1; | ||
| 59 | u64 lazy:1; | ||
| 60 | u64 auto_enable:1; | ||
| 61 | u64 apic_vector:8; | ||
| 62 | u64 direct_mode:1; | ||
| 63 | u64 reserved_z0:3; | ||
| 64 | u64 sintx:4; | ||
| 65 | u64 reserved_z1:44; | ||
| 66 | }; | ||
| 67 | }; | ||
| 68 | |||
| 69 | |||
| 70 | /* Define the synthetic interrupt controller event flags format. */ | ||
| 71 | union hv_synic_event_flags { | ||
| 72 | unsigned long flags[HV_EVENT_FLAGS_LONG_COUNT]; | ||
| 73 | }; | ||
| 74 | |||
| 75 | /* Define SynIC control register. */ | ||
| 76 | union hv_synic_scontrol { | ||
| 77 | u64 as_uint64; | ||
| 78 | struct { | ||
| 79 | u64 enable:1; | ||
| 80 | u64 reserved:63; | ||
| 81 | }; | ||
| 82 | }; | ||
| 83 | |||
| 84 | /* Define synthetic interrupt source. */ | ||
| 85 | union hv_synic_sint { | ||
| 86 | u64 as_uint64; | ||
| 87 | struct { | ||
| 88 | u64 vector:8; | ||
| 89 | u64 reserved1:8; | ||
| 90 | u64 masked:1; | ||
| 91 | u64 auto_eoi:1; | ||
| 92 | u64 reserved2:46; | ||
| 93 | }; | ||
| 94 | }; | ||
| 95 | |||
| 96 | /* Define the format of the SIMP register */ | ||
| 97 | union hv_synic_simp { | ||
| 98 | u64 as_uint64; | ||
| 99 | struct { | ||
| 100 | u64 simp_enabled:1; | ||
| 101 | u64 preserved:11; | ||
| 102 | u64 base_simp_gpa:52; | ||
| 103 | }; | ||
| 104 | }; | ||
| 105 | |||
| 106 | /* Define the format of the SIEFP register */ | ||
| 107 | union hv_synic_siefp { | ||
| 108 | u64 as_uint64; | ||
| 109 | struct { | ||
| 110 | u64 siefp_enabled:1; | ||
| 111 | u64 preserved:11; | ||
| 112 | u64 base_siefp_gpa:52; | ||
| 113 | }; | ||
| 114 | }; | ||
| 115 | 47 | ||
| 116 | /* Definitions for the monitored notification facility */ | 48 | /* Definitions for the monitored notification facility */ |
| 117 | union hv_monitor_trigger_group { | 49 | union hv_monitor_trigger_group { |
diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index 6502feb9524b..33771352dcd6 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h | |||
| @@ -21,7 +21,6 @@ | |||
| 21 | 21 | ||
| 22 | #include <linux/clocksource.h> | 22 | #include <linux/clocksource.h> |
| 23 | #include <linux/hrtimer.h> | 23 | #include <linux/hrtimer.h> |
| 24 | #include <linux/workqueue.h> | ||
| 25 | 24 | ||
| 26 | struct arch_timer_context { | 25 | struct arch_timer_context { |
| 27 | /* Registers: control register, timer value */ | 26 | /* Registers: control register, timer value */ |
| @@ -52,9 +51,6 @@ struct arch_timer_cpu { | |||
| 52 | /* Background timer used when the guest is not running */ | 51 | /* Background timer used when the guest is not running */ |
| 53 | struct hrtimer bg_timer; | 52 | struct hrtimer bg_timer; |
| 54 | 53 | ||
| 55 | /* Work queued with the above timer expires */ | ||
| 56 | struct work_struct expired; | ||
| 57 | |||
| 58 | /* Physical timer emulation */ | 54 | /* Physical timer emulation */ |
| 59 | struct hrtimer phys_timer; | 55 | struct hrtimer phys_timer; |
| 60 | 56 | ||
diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index f8c400ba1929..fe07b680dd4a 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h | |||
| @@ -37,7 +37,6 @@ | |||
| 37 | # define __GCC4_has_attribute___designated_init__ 0 | 37 | # define __GCC4_has_attribute___designated_init__ 0 |
| 38 | # define __GCC4_has_attribute___externally_visible__ 1 | 38 | # define __GCC4_has_attribute___externally_visible__ 1 |
| 39 | # define __GCC4_has_attribute___noclone__ 1 | 39 | # define __GCC4_has_attribute___noclone__ 1 |
| 40 | # define __GCC4_has_attribute___optimize__ 1 | ||
| 41 | # define __GCC4_has_attribute___nonstring__ 0 | 40 | # define __GCC4_has_attribute___nonstring__ 0 |
| 42 | # define __GCC4_has_attribute___no_sanitize_address__ (__GNUC_MINOR__ >= 8) | 41 | # define __GCC4_has_attribute___no_sanitize_address__ (__GNUC_MINOR__ >= 8) |
| 43 | #endif | 42 | #endif |
| @@ -163,17 +162,11 @@ | |||
| 163 | 162 | ||
| 164 | /* | 163 | /* |
| 165 | * Optional: not supported by clang | 164 | * Optional: not supported by clang |
| 166 | * Note: icc does not recognize gcc's no-tracer | ||
| 167 | * | 165 | * |
| 168 | * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-noclone-function-attribute | 166 | * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-noclone-function-attribute |
| 169 | * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-optimize-function-attribute | ||
| 170 | */ | 167 | */ |
| 171 | #if __has_attribute(__noclone__) | 168 | #if __has_attribute(__noclone__) |
| 172 | # if __has_attribute(__optimize__) | 169 | # define __noclone __attribute__((__noclone__)) |
| 173 | # define __noclone __attribute__((__noclone__, __optimize__("no-tracer"))) | ||
| 174 | # else | ||
| 175 | # define __noclone __attribute__((__noclone__)) | ||
| 176 | # endif | ||
| 177 | #else | 170 | #else |
| 178 | # define __noclone | 171 | # define __noclone |
| 179 | #endif | 172 | #endif |
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c926698040e0..c38cc5eb7e73 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h | |||
| @@ -449,6 +449,7 @@ struct kvm { | |||
| 449 | #endif | 449 | #endif |
| 450 | long tlbs_dirty; | 450 | long tlbs_dirty; |
| 451 | struct list_head devices; | 451 | struct list_head devices; |
| 452 | bool manual_dirty_log_protect; | ||
| 452 | struct dentry *debugfs_dentry; | 453 | struct dentry *debugfs_dentry; |
| 453 | struct kvm_stat_data **debugfs_stat_data; | 454 | struct kvm_stat_data **debugfs_stat_data; |
| 454 | struct srcu_struct srcu; | 455 | struct srcu_struct srcu; |
| @@ -694,7 +695,8 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, | |||
| 694 | int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, | 695 | int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, |
| 695 | void *data, unsigned long len); | 696 | void *data, unsigned long len); |
| 696 | int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, | 697 | int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, |
| 697 | void *data, int offset, unsigned long len); | 698 | void *data, unsigned int offset, |
| 699 | unsigned long len); | ||
| 698 | int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, | 700 | int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, |
| 699 | gpa_t gpa, unsigned long len); | 701 | gpa_t gpa, unsigned long len); |
| 700 | int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); | 702 | int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); |
| @@ -753,7 +755,9 @@ int kvm_get_dirty_log(struct kvm *kvm, | |||
| 753 | struct kvm_dirty_log *log, int *is_dirty); | 755 | struct kvm_dirty_log *log, int *is_dirty); |
| 754 | 756 | ||
| 755 | int kvm_get_dirty_log_protect(struct kvm *kvm, | 757 | int kvm_get_dirty_log_protect(struct kvm *kvm, |
| 756 | struct kvm_dirty_log *log, bool *is_dirty); | 758 | struct kvm_dirty_log *log, bool *flush); |
| 759 | int kvm_clear_dirty_log_protect(struct kvm *kvm, | ||
| 760 | struct kvm_clear_dirty_log *log, bool *flush); | ||
| 757 | 761 | ||
| 758 | void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, | 762 | void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, |
| 759 | struct kvm_memory_slot *slot, | 763 | struct kvm_memory_slot *slot, |
| @@ -762,9 +766,13 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, | |||
| 762 | 766 | ||
| 763 | int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | 767 | int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, |
| 764 | struct kvm_dirty_log *log); | 768 | struct kvm_dirty_log *log); |
| 769 | int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, | ||
| 770 | struct kvm_clear_dirty_log *log); | ||
| 765 | 771 | ||
| 766 | int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, | 772 | int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, |
| 767 | bool line_status); | 773 | bool line_status); |
| 774 | int kvm_vm_ioctl_enable_cap(struct kvm *kvm, | ||
| 775 | struct kvm_enable_cap *cap); | ||
| 768 | long kvm_arch_vm_ioctl(struct file *filp, | 776 | long kvm_arch_vm_ioctl(struct file *filp, |
| 769 | unsigned int ioctl, unsigned long arg); | 777 | unsigned int ioctl, unsigned long arg); |
| 770 | 778 | ||
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 2b7a652c9fa4..6d4ea4b6c922 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h | |||
| @@ -492,6 +492,17 @@ struct kvm_dirty_log { | |||
| 492 | }; | 492 | }; |
| 493 | }; | 493 | }; |
| 494 | 494 | ||
| 495 | /* for KVM_CLEAR_DIRTY_LOG */ | ||
| 496 | struct kvm_clear_dirty_log { | ||
| 497 | __u32 slot; | ||
| 498 | __u32 num_pages; | ||
| 499 | __u64 first_page; | ||
| 500 | union { | ||
| 501 | void __user *dirty_bitmap; /* one bit per page */ | ||
| 502 | __u64 padding2; | ||
| 503 | }; | ||
| 504 | }; | ||
| 505 | |||
| 495 | /* for KVM_SET_SIGNAL_MASK */ | 506 | /* for KVM_SET_SIGNAL_MASK */ |
| 496 | struct kvm_signal_mask { | 507 | struct kvm_signal_mask { |
| 497 | __u32 len; | 508 | __u32 len; |
| @@ -975,6 +986,8 @@ struct kvm_ppc_resize_hpt { | |||
| 975 | #define KVM_CAP_HYPERV_ENLIGHTENED_VMCS 163 | 986 | #define KVM_CAP_HYPERV_ENLIGHTENED_VMCS 163 |
| 976 | #define KVM_CAP_EXCEPTION_PAYLOAD 164 | 987 | #define KVM_CAP_EXCEPTION_PAYLOAD 164 |
| 977 | #define KVM_CAP_ARM_VM_IPA_SIZE 165 | 988 | #define KVM_CAP_ARM_VM_IPA_SIZE 165 |
| 989 | #define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166 | ||
| 990 | #define KVM_CAP_HYPERV_CPUID 167 | ||
| 978 | 991 | ||
| 979 | #ifdef KVM_CAP_IRQ_ROUTING | 992 | #ifdef KVM_CAP_IRQ_ROUTING |
| 980 | 993 | ||
| @@ -1421,6 +1434,12 @@ struct kvm_enc_region { | |||
| 1421 | #define KVM_GET_NESTED_STATE _IOWR(KVMIO, 0xbe, struct kvm_nested_state) | 1434 | #define KVM_GET_NESTED_STATE _IOWR(KVMIO, 0xbe, struct kvm_nested_state) |
| 1422 | #define KVM_SET_NESTED_STATE _IOW(KVMIO, 0xbf, struct kvm_nested_state) | 1435 | #define KVM_SET_NESTED_STATE _IOW(KVMIO, 0xbf, struct kvm_nested_state) |
| 1423 | 1436 | ||
| 1437 | /* Available with KVM_CAP_MANUAL_DIRTY_LOG_PROTECT */ | ||
| 1438 | #define KVM_CLEAR_DIRTY_LOG _IOWR(KVMIO, 0xc0, struct kvm_clear_dirty_log) | ||
| 1439 | |||
| 1440 | /* Available with KVM_CAP_HYPERV_CPUID */ | ||
| 1441 | #define KVM_GET_SUPPORTED_HV_CPUID _IOWR(KVMIO, 0xc1, struct kvm_cpuid2) | ||
| 1442 | |||
| 1424 | /* Secure Encrypted Virtualization command */ | 1443 | /* Secure Encrypted Virtualization command */ |
| 1425 | enum sev_cmd_id { | 1444 | enum sev_cmd_id { |
| 1426 | /* Guest initialization commands */ | 1445 | /* Guest initialization commands */ |
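Both new capabilities are opt-in, and KVM_CAP_MANUAL_DIRTY_LOG_PROTECT in particular must be enabled per VM through KVM_ENABLE_CAP before KVM_CLEAR_DIRTY_LOG is of any use. A minimal sketch, assuming args[0] = 1 selects the manual-protect behaviour (enable_manual_dirty_log_protect is an illustrative name):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: opt a VM in to manual dirty-log protection. */
static int enable_manual_dirty_log_protect(int vm_fd)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT,
		.args = { 1 },	/* non-zero: userspace will clear dirty bits itself */
	};

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}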
diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 195ba486640f..2ed395b817cb 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat | |||
| @@ -1,4 +1,4 @@ | |||
| 1 | #!/usr/bin/python | 1 | #!/usr/bin/env python3 |
| 2 | # | 2 | # |
| 3 | # top-like utility for displaying kvm statistics | 3 | # top-like utility for displaying kvm statistics |
| 4 | # | 4 | # |
diff --git a/tools/testing/selftests/android/Makefile b/tools/testing/selftests/android/Makefile index d9a725478375..72c25a3cb658 100644 --- a/tools/testing/selftests/android/Makefile +++ b/tools/testing/selftests/android/Makefile | |||
| @@ -6,7 +6,7 @@ TEST_PROGS := run.sh | |||
| 6 | 6 | ||
| 7 | include ../lib.mk | 7 | include ../lib.mk |
| 8 | 8 | ||
| 9 | all: khdr | 9 | all: |
| 10 | @for DIR in $(SUBDIRS); do \ | 10 | @for DIR in $(SUBDIRS); do \ |
| 11 | BUILD_TARGET=$(OUTPUT)/$$DIR; \ | 11 | BUILD_TARGET=$(OUTPUT)/$$DIR; \ |
| 12 | mkdir $$BUILD_TARGET -p; \ | 12 | mkdir $$BUILD_TARGET -p; \ |
diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile index ad1eeb14fda7..30996306cabc 100644 --- a/tools/testing/selftests/futex/functional/Makefile +++ b/tools/testing/selftests/futex/functional/Makefile | |||
| @@ -19,6 +19,7 @@ TEST_GEN_FILES := \ | |||
| 19 | TEST_PROGS := run.sh | 19 | TEST_PROGS := run.sh |
| 20 | 20 | ||
| 21 | top_srcdir = ../../../../.. | 21 | top_srcdir = ../../../../.. |
| 22 | KSFT_KHDR_INSTALL := 1 | ||
| 22 | include ../../lib.mk | 23 | include ../../lib.mk |
| 23 | 24 | ||
| 24 | $(TEST_GEN_FILES): $(HEADERS) | 25 | $(TEST_GEN_FILES): $(HEADERS) |
diff --git a/tools/testing/selftests/gpio/Makefile b/tools/testing/selftests/gpio/Makefile index 46648427d537..07f572a1bd3f 100644 --- a/tools/testing/selftests/gpio/Makefile +++ b/tools/testing/selftests/gpio/Makefile | |||
| @@ -10,8 +10,6 @@ TEST_PROGS_EXTENDED := gpio-mockup-chardev | |||
| 10 | GPIODIR := $(realpath ../../../gpio) | 10 | GPIODIR := $(realpath ../../../gpio) |
| 11 | GPIOOBJ := gpio-utils.o | 11 | GPIOOBJ := gpio-utils.o |
| 12 | 12 | ||
| 13 | include ../lib.mk | ||
| 14 | |||
| 15 | all: $(TEST_PROGS_EXTENDED) | 13 | all: $(TEST_PROGS_EXTENDED) |
| 16 | 14 | ||
| 17 | override define CLEAN | 15 | override define CLEAN |
| @@ -19,7 +17,9 @@ override define CLEAN | |||
| 19 | $(MAKE) -C $(GPIODIR) OUTPUT=$(GPIODIR)/ clean | 17 | $(MAKE) -C $(GPIODIR) OUTPUT=$(GPIODIR)/ clean |
| 20 | endef | 18 | endef |
| 21 | 19 | ||
| 22 | $(TEST_PROGS_EXTENDED):| khdr | 20 | KSFT_KHDR_INSTALL := 1 |
| 21 | include ../lib.mk | ||
| 22 | |||
| 23 | $(TEST_PROGS_EXTENDED): $(GPIODIR)/$(GPIOOBJ) | 23 | $(TEST_PROGS_EXTENDED): $(GPIODIR)/$(GPIOOBJ) |
| 24 | 24 | ||
| 25 | $(GPIODIR)/$(GPIOOBJ): | 25 | $(GPIODIR)/$(GPIOOBJ): |
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 01a219229238..f9a0e9938480 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile | |||
| @@ -1,6 +1,7 @@ | |||
| 1 | all: | 1 | all: |
| 2 | 2 | ||
| 3 | top_srcdir = ../../../.. | 3 | top_srcdir = ../../../.. |
| 4 | KSFT_KHDR_INSTALL := 1 | ||
| 4 | UNAME_M := $(shell uname -m) | 5 | UNAME_M := $(shell uname -m) |
| 5 | 6 | ||
| 6 | LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/ucall.c lib/sparsebit.c | 7 | LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/ucall.c lib/sparsebit.c |
| @@ -14,9 +15,12 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test | |||
| 14 | TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test | 15 | TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test |
| 15 | TEST_GEN_PROGS_x86_64 += x86_64/state_test | 16 | TEST_GEN_PROGS_x86_64 += x86_64/state_test |
| 16 | TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test | 17 | TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test |
| 18 | TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid | ||
| 17 | TEST_GEN_PROGS_x86_64 += dirty_log_test | 19 | TEST_GEN_PROGS_x86_64 += dirty_log_test |
| 20 | TEST_GEN_PROGS_x86_64 += clear_dirty_log_test | ||
| 18 | 21 | ||
| 19 | TEST_GEN_PROGS_aarch64 += dirty_log_test | 22 | TEST_GEN_PROGS_aarch64 += dirty_log_test |
| 23 | TEST_GEN_PROGS_aarch64 += clear_dirty_log_test | ||
| 20 | 24 | ||
| 21 | TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M)) | 25 | TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M)) |
| 22 | LIBKVM += $(LIBKVM_$(UNAME_M)) | 26 | LIBKVM += $(LIBKVM_$(UNAME_M)) |
| @@ -44,7 +48,6 @@ $(OUTPUT)/libkvm.a: $(LIBKVM_OBJ) | |||
| 44 | 48 | ||
| 45 | all: $(STATIC_LIBS) | 49 | all: $(STATIC_LIBS) |
| 46 | $(TEST_GEN_PROGS): $(STATIC_LIBS) | 50 | $(TEST_GEN_PROGS): $(STATIC_LIBS) |
| 47 | $(STATIC_LIBS):| khdr | ||
| 48 | 51 | ||
| 49 | cscope: include_paths = $(LINUX_TOOL_INCLUDE) $(LINUX_HDR_PATH) include lib .. | 52 | cscope: include_paths = $(LINUX_TOOL_INCLUDE) $(LINUX_HDR_PATH) include lib .. |
| 50 | cscope: | 53 | cscope: |
diff --git a/tools/testing/selftests/kvm/clear_dirty_log_test.c b/tools/testing/selftests/kvm/clear_dirty_log_test.c new file mode 100644 index 000000000000..749336937d37 --- /dev/null +++ b/tools/testing/selftests/kvm/clear_dirty_log_test.c | |||
| @@ -0,0 +1,2 @@ | |||
| 1 | #define USE_CLEAR_DIRTY_LOG | ||
| 2 | #include "dirty_log_test.c" | ||
diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index aeff95a91b15..4715cfba20dc 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c | |||
| @@ -51,10 +51,17 @@ static uint64_t random_array[TEST_PAGES_PER_LOOP]; | |||
| 51 | static uint64_t iteration; | 51 | static uint64_t iteration; |
| 52 | 52 | ||
| 53 | /* | 53 | /* |
| 54 | * GPA offset of the testing memory slot. Must be bigger than | 54 | * Guest physical memory offset of the testing memory slot. |
| 55 | * DEFAULT_GUEST_PHY_PAGES. | 55 | * This will be set to the topmost valid physical address minus |
| 56 | * the test memory size. | ||
| 56 | */ | 57 | */ |
| 57 | static uint64_t guest_test_mem = DEFAULT_GUEST_TEST_MEM; | 58 | static uint64_t guest_test_phys_mem; |
| 59 | |||
| 60 | /* | ||
| 61 | * Guest virtual memory offset of the testing memory slot. | ||
| 62 | * Must not conflict with identity mapped test code. | ||
| 63 | */ | ||
| 64 | static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM; | ||
| 58 | 65 | ||
| 59 | /* | 66 | /* |
| 60 | * Continuously write to the first 8 bytes of a random pages within | 67 | * Continuously write to the first 8 bytes of a random pages within |
| @@ -66,7 +73,7 @@ static void guest_code(void) | |||
| 66 | 73 | ||
| 67 | while (true) { | 74 | while (true) { |
| 68 | for (i = 0; i < TEST_PAGES_PER_LOOP; i++) { | 75 | for (i = 0; i < TEST_PAGES_PER_LOOP; i++) { |
| 69 | uint64_t addr = guest_test_mem; | 76 | uint64_t addr = guest_test_virt_mem; |
| 70 | addr += (READ_ONCE(random_array[i]) % guest_num_pages) | 77 | addr += (READ_ONCE(random_array[i]) % guest_num_pages) |
| 71 | * guest_page_size; | 78 | * guest_page_size; |
| 72 | addr &= ~(host_page_size - 1); | 79 | addr &= ~(host_page_size - 1); |
| @@ -209,12 +216,14 @@ static void vm_dirty_log_verify(unsigned long *bmap) | |||
| 209 | } | 216 | } |
| 210 | 217 | ||
| 211 | static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, | 218 | static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, |
| 212 | uint64_t extra_mem_pages, void *guest_code) | 219 | uint64_t extra_mem_pages, void *guest_code, |
| 220 | unsigned long type) | ||
| 213 | { | 221 | { |
| 214 | struct kvm_vm *vm; | 222 | struct kvm_vm *vm; |
| 215 | uint64_t extra_pg_pages = extra_mem_pages / 512 * 2; | 223 | uint64_t extra_pg_pages = extra_mem_pages / 512 * 2; |
| 216 | 224 | ||
| 217 | vm = vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); | 225 | vm = _vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, |
| 226 | O_RDWR, type); | ||
| 218 | kvm_vm_elf_load(vm, program_invocation_name, 0, 0); | 227 | kvm_vm_elf_load(vm, program_invocation_name, 0, 0); |
| 219 | #ifdef __x86_64__ | 228 | #ifdef __x86_64__ |
| 220 | vm_create_irqchip(vm); | 229 | vm_create_irqchip(vm); |
| @@ -224,13 +233,14 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, | |||
| 224 | } | 233 | } |
| 225 | 234 | ||
| 226 | static void run_test(enum vm_guest_mode mode, unsigned long iterations, | 235 | static void run_test(enum vm_guest_mode mode, unsigned long iterations, |
| 227 | unsigned long interval, bool top_offset) | 236 | unsigned long interval, uint64_t phys_offset) |
| 228 | { | 237 | { |
| 229 | unsigned int guest_pa_bits, guest_page_shift; | 238 | unsigned int guest_pa_bits, guest_page_shift; |
| 230 | pthread_t vcpu_thread; | 239 | pthread_t vcpu_thread; |
| 231 | struct kvm_vm *vm; | 240 | struct kvm_vm *vm; |
| 232 | uint64_t max_gfn; | 241 | uint64_t max_gfn; |
| 233 | unsigned long *bmap; | 242 | unsigned long *bmap; |
| 243 | unsigned long type = 0; | ||
| 234 | 244 | ||
| 235 | switch (mode) { | 245 | switch (mode) { |
| 236 | case VM_MODE_P52V48_4K: | 246 | case VM_MODE_P52V48_4K: |
| @@ -241,6 +251,14 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, | |||
| 241 | guest_pa_bits = 52; | 251 | guest_pa_bits = 52; |
| 242 | guest_page_shift = 16; | 252 | guest_page_shift = 16; |
| 243 | break; | 253 | break; |
| 254 | case VM_MODE_P48V48_4K: | ||
| 255 | guest_pa_bits = 48; | ||
| 256 | guest_page_shift = 12; | ||
| 257 | break; | ||
| 258 | case VM_MODE_P48V48_64K: | ||
| 259 | guest_pa_bits = 48; | ||
| 260 | guest_page_shift = 16; | ||
| 261 | break; | ||
| 244 | case VM_MODE_P40V48_4K: | 262 | case VM_MODE_P40V48_4K: |
| 245 | guest_pa_bits = 40; | 263 | guest_pa_bits = 40; |
| 246 | guest_page_shift = 12; | 264 | guest_page_shift = 12; |
| @@ -255,6 +273,19 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, | |||
| 255 | 273 | ||
| 256 | DEBUG("Testing guest mode: %s\n", vm_guest_mode_string(mode)); | 274 | DEBUG("Testing guest mode: %s\n", vm_guest_mode_string(mode)); |
| 257 | 275 | ||
| 276 | #ifdef __x86_64__ | ||
| 277 | /* | ||
| 278 | * FIXME | ||
| 279 | * The x86_64 kvm selftests framework currently only supports a | ||
| 280 | * single PML4 which restricts the number of usable physical | ||
| 281 | * address bits to 39. | ||
| 282 | */ | ||
| 283 | guest_pa_bits = 39; | ||
| 284 | #endif | ||
| 285 | #ifdef __aarch64__ | ||
| 286 | if (guest_pa_bits != 40) | ||
| 287 | type = KVM_VM_TYPE_ARM_IPA_SIZE(guest_pa_bits); | ||
| 288 | #endif | ||
| 258 | max_gfn = (1ul << (guest_pa_bits - guest_page_shift)) - 1; | 289 | max_gfn = (1ul << (guest_pa_bits - guest_page_shift)) - 1; |
| 259 | guest_page_size = (1ul << guest_page_shift); | 290 | guest_page_size = (1ul << guest_page_shift); |
| 260 | /* 1G of guest page sized pages */ | 291 | /* 1G of guest page sized pages */ |
| @@ -263,31 +294,41 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, | |||
| 263 | host_num_pages = (guest_num_pages * guest_page_size) / host_page_size + | 294 | host_num_pages = (guest_num_pages * guest_page_size) / host_page_size + |
| 264 | !!((guest_num_pages * guest_page_size) % host_page_size); | 295 | !!((guest_num_pages * guest_page_size) % host_page_size); |
| 265 | 296 | ||
| 266 | if (top_offset) { | 297 | if (!phys_offset) { |
| 267 | guest_test_mem = (max_gfn - guest_num_pages) * guest_page_size; | 298 | guest_test_phys_mem = (max_gfn - guest_num_pages) * guest_page_size; |
| 268 | guest_test_mem &= ~(host_page_size - 1); | 299 | guest_test_phys_mem &= ~(host_page_size - 1); |
| 300 | } else { | ||
| 301 | guest_test_phys_mem = phys_offset; | ||
| 269 | } | 302 | } |
| 270 | 303 | ||
| 271 | DEBUG("guest test mem offset: 0x%lx\n", guest_test_mem); | 304 | DEBUG("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); |
| 272 | 305 | ||
| 273 | bmap = bitmap_alloc(host_num_pages); | 306 | bmap = bitmap_alloc(host_num_pages); |
| 274 | host_bmap_track = bitmap_alloc(host_num_pages); | 307 | host_bmap_track = bitmap_alloc(host_num_pages); |
| 275 | 308 | ||
| 276 | vm = create_vm(mode, VCPU_ID, guest_num_pages, guest_code); | 309 | vm = create_vm(mode, VCPU_ID, guest_num_pages, guest_code, type); |
| 310 | |||
| 311 | #ifdef USE_CLEAR_DIRTY_LOG | ||
| 312 | struct kvm_enable_cap cap = {}; | ||
| 313 | |||
| 314 | cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT; | ||
| 315 | cap.args[0] = 1; | ||
| 316 | vm_enable_cap(vm, &cap); | ||
| 317 | #endif | ||
| 277 | 318 | ||
| 278 | /* Add an extra memory slot for testing dirty logging */ | 319 | /* Add an extra memory slot for testing dirty logging */ |
| 279 | vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, | 320 | vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, |
| 280 | guest_test_mem, | 321 | guest_test_phys_mem, |
| 281 | TEST_MEM_SLOT_INDEX, | 322 | TEST_MEM_SLOT_INDEX, |
| 282 | guest_num_pages, | 323 | guest_num_pages, |
| 283 | KVM_MEM_LOG_DIRTY_PAGES); | 324 | KVM_MEM_LOG_DIRTY_PAGES); |
| 284 | 325 | ||
| 285 | /* Do 1:1 mapping for the dirty track memory slot */ | 326 | /* Do mapping for the dirty track memory slot */ |
| 286 | virt_map(vm, guest_test_mem, guest_test_mem, | 327 | virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, |
| 287 | guest_num_pages * guest_page_size, 0); | 328 | guest_num_pages * guest_page_size, 0); |
| 288 | 329 | ||
| 289 | /* Cache the HVA pointer of the region */ | 330 | /* Cache the HVA pointer of the region */ |
| 290 | host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_mem); | 331 | host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem); |
| 291 | 332 | ||
| 292 | #ifdef __x86_64__ | 333 | #ifdef __x86_64__ |
| 293 | vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); | 334 | vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); |
| @@ -299,7 +340,7 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, | |||
| 299 | /* Export the shared variables to the guest */ | 340 | /* Export the shared variables to the guest */ |
| 300 | sync_global_to_guest(vm, host_page_size); | 341 | sync_global_to_guest(vm, host_page_size); |
| 301 | sync_global_to_guest(vm, guest_page_size); | 342 | sync_global_to_guest(vm, guest_page_size); |
| 302 | sync_global_to_guest(vm, guest_test_mem); | 343 | sync_global_to_guest(vm, guest_test_virt_mem); |
| 303 | sync_global_to_guest(vm, guest_num_pages); | 344 | sync_global_to_guest(vm, guest_num_pages); |
| 304 | 345 | ||
| 305 | /* Start the iterations */ | 346 | /* Start the iterations */ |
| @@ -316,6 +357,10 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, | |||
| 316 | /* Give the vcpu thread some time to dirty some pages */ | 357 | /* Give the vcpu thread some time to dirty some pages */ |
| 317 | usleep(interval * 1000); | 358 | usleep(interval * 1000); |
| 318 | kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); | 359 | kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); |
| 360 | #ifdef USE_CLEAR_DIRTY_LOG | ||
| 361 | kvm_vm_clear_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap, 0, | ||
| 362 | DIV_ROUND_UP(host_num_pages, 64) * 64); | ||
| 363 | #endif | ||
| 319 | vm_dirty_log_verify(bmap); | 364 | vm_dirty_log_verify(bmap); |
| 320 | iteration++; | 365 | iteration++; |
| 321 | sync_global_to_guest(vm, iteration); | 366 | sync_global_to_guest(vm, iteration); |
| @@ -335,23 +380,16 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, | |||
| 335 | kvm_vm_free(vm); | 380 | kvm_vm_free(vm); |
| 336 | } | 381 | } |
| 337 | 382 | ||
| 338 | static struct vm_guest_modes { | 383 | struct vm_guest_mode_params { |
| 339 | enum vm_guest_mode mode; | ||
| 340 | bool supported; | 384 | bool supported; |
| 341 | bool enabled; | 385 | bool enabled; |
| 342 | } vm_guest_modes[NUM_VM_MODES] = { | ||
| 343 | #if defined(__x86_64__) | ||
| 344 | { VM_MODE_P52V48_4K, 1, 1, }, | ||
| 345 | { VM_MODE_P52V48_64K, 0, 0, }, | ||
| 346 | { VM_MODE_P40V48_4K, 0, 0, }, | ||
| 347 | { VM_MODE_P40V48_64K, 0, 0, }, | ||
| 348 | #elif defined(__aarch64__) | ||
| 349 | { VM_MODE_P52V48_4K, 0, 0, }, | ||
| 350 | { VM_MODE_P52V48_64K, 0, 0, }, | ||
| 351 | { VM_MODE_P40V48_4K, 1, 1, }, | ||
| 352 | { VM_MODE_P40V48_64K, 1, 1, }, | ||
| 353 | #endif | ||
| 354 | }; | 386 | }; |
| 387 | struct vm_guest_mode_params vm_guest_mode_params[NUM_VM_MODES]; | ||
| 388 | |||
| 389 | #define vm_guest_mode_params_init(mode, supported, enabled) \ | ||
| 390 | ({ \ | ||
| 391 | vm_guest_mode_params[mode] = (struct vm_guest_mode_params){ supported, enabled }; \ | ||
| 392 | }) | ||
| 355 | 393 | ||
| 356 | static void help(char *name) | 394 | static void help(char *name) |
| 357 | { | 395 | { |
| @@ -359,25 +397,21 @@ static void help(char *name) | |||
| 359 | 397 | ||
| 360 | puts(""); | 398 | puts(""); |
| 361 | printf("usage: %s [-h] [-i iterations] [-I interval] " | 399 | printf("usage: %s [-h] [-i iterations] [-I interval] " |
| 362 | "[-o offset] [-t] [-m mode]\n", name); | 400 | "[-p offset] [-m mode]\n", name); |
| 363 | puts(""); | 401 | puts(""); |
| 364 | printf(" -i: specify iteration counts (default: %"PRIu64")\n", | 402 | printf(" -i: specify iteration counts (default: %"PRIu64")\n", |
| 365 | TEST_HOST_LOOP_N); | 403 | TEST_HOST_LOOP_N); |
| 366 | printf(" -I: specify interval in ms (default: %"PRIu64" ms)\n", | 404 | printf(" -I: specify interval in ms (default: %"PRIu64" ms)\n", |
| 367 | TEST_HOST_LOOP_INTERVAL); | 405 | TEST_HOST_LOOP_INTERVAL); |
| 368 | printf(" -o: guest test memory offset (default: 0x%lx)\n", | 406 | printf(" -p: specify guest physical test memory offset\n" |
| 369 | DEFAULT_GUEST_TEST_MEM); | 407 | " Warning: a low offset can conflict with the loaded test code.\n"); |
| 370 | printf(" -t: map guest test memory at the top of the allowed " | ||
| 371 | "physical address range\n"); | ||
| 372 | printf(" -m: specify the guest mode ID to test " | 408 | printf(" -m: specify the guest mode ID to test " |
| 373 | "(default: test all supported modes)\n" | 409 | "(default: test all supported modes)\n" |
| 374 | " This option may be used multiple times.\n" | 410 | " This option may be used multiple times.\n" |
| 375 | " Guest mode IDs:\n"); | 411 | " Guest mode IDs:\n"); |
| 376 | for (i = 0; i < NUM_VM_MODES; ++i) { | 412 | for (i = 0; i < NUM_VM_MODES; ++i) { |
| 377 | printf(" %d: %s%s\n", | 413 | printf(" %d: %s%s\n", i, vm_guest_mode_string(i), |
| 378 | vm_guest_modes[i].mode, | 414 | vm_guest_mode_params[i].supported ? " (supported)" : ""); |
| 379 | vm_guest_mode_string(vm_guest_modes[i].mode), | ||
| 380 | vm_guest_modes[i].supported ? " (supported)" : ""); | ||
| 381 | } | 415 | } |
| 382 | puts(""); | 416 | puts(""); |
| 383 | exit(0); | 417 | exit(0); |
| @@ -388,11 +422,34 @@ int main(int argc, char *argv[]) | |||
| 388 | unsigned long iterations = TEST_HOST_LOOP_N; | 422 | unsigned long iterations = TEST_HOST_LOOP_N; |
| 389 | unsigned long interval = TEST_HOST_LOOP_INTERVAL; | 423 | unsigned long interval = TEST_HOST_LOOP_INTERVAL; |
| 390 | bool mode_selected = false; | 424 | bool mode_selected = false; |
| 391 | bool top_offset = false; | 425 | uint64_t phys_offset = 0; |
| 392 | unsigned int mode; | 426 | unsigned int mode, host_ipa_limit; |
| 393 | int opt, i; | 427 | int opt, i; |
| 394 | 428 | ||
| 395 | while ((opt = getopt(argc, argv, "hi:I:o:tm:")) != -1) { | 429 | #ifdef USE_CLEAR_DIRTY_LOG |
| 430 | if (!kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT)) { | ||
| 431 | fprintf(stderr, "KVM_CLEAR_DIRTY_LOG not available, skipping tests\n"); | ||
| 432 | exit(KSFT_SKIP); | ||
| 433 | } | ||
| 434 | #endif | ||
| 435 | |||
| 436 | #ifdef __x86_64__ | ||
| 437 | vm_guest_mode_params_init(VM_MODE_P52V48_4K, true, true); | ||
| 438 | #endif | ||
| 439 | #ifdef __aarch64__ | ||
| 440 | vm_guest_mode_params_init(VM_MODE_P40V48_4K, true, true); | ||
| 441 | vm_guest_mode_params_init(VM_MODE_P40V48_64K, true, true); | ||
| 442 | |||
| 443 | host_ipa_limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE); | ||
| 444 | if (host_ipa_limit >= 52) | ||
| 445 | vm_guest_mode_params_init(VM_MODE_P52V48_64K, true, true); | ||
| 446 | if (host_ipa_limit >= 48) { | ||
| 447 | vm_guest_mode_params_init(VM_MODE_P48V48_4K, true, true); | ||
| 448 | vm_guest_mode_params_init(VM_MODE_P48V48_64K, true, true); | ||
| 449 | } | ||
| 450 | #endif | ||
| 451 | |||
| 452 | while ((opt = getopt(argc, argv, "hi:I:p:m:")) != -1) { | ||
| 396 | switch (opt) { | 453 | switch (opt) { |
| 397 | case 'i': | 454 | case 'i': |
| 398 | iterations = strtol(optarg, NULL, 10); | 455 | iterations = strtol(optarg, NULL, 10); |
| @@ -400,22 +457,19 @@ int main(int argc, char *argv[]) | |||
| 400 | case 'I': | 457 | case 'I': |
| 401 | interval = strtol(optarg, NULL, 10); | 458 | interval = strtol(optarg, NULL, 10); |
| 402 | break; | 459 | break; |
| 403 | case 'o': | 460 | case 'p': |
| 404 | guest_test_mem = strtoull(optarg, NULL, 0); | 461 | phys_offset = strtoull(optarg, NULL, 0); |
| 405 | break; | ||
| 406 | case 't': | ||
| 407 | top_offset = true; | ||
| 408 | break; | 462 | break; |
| 409 | case 'm': | 463 | case 'm': |
| 410 | if (!mode_selected) { | 464 | if (!mode_selected) { |
| 411 | for (i = 0; i < NUM_VM_MODES; ++i) | 465 | for (i = 0; i < NUM_VM_MODES; ++i) |
| 412 | vm_guest_modes[i].enabled = 0; | 466 | vm_guest_mode_params[i].enabled = false; |
| 413 | mode_selected = true; | 467 | mode_selected = true; |
| 414 | } | 468 | } |
| 415 | mode = strtoul(optarg, NULL, 10); | 469 | mode = strtoul(optarg, NULL, 10); |
| 416 | TEST_ASSERT(mode < NUM_VM_MODES, | 470 | TEST_ASSERT(mode < NUM_VM_MODES, |
| 417 | "Guest mode ID %d too big", mode); | 471 | "Guest mode ID %d too big", mode); |
| 418 | vm_guest_modes[mode].enabled = 1; | 472 | vm_guest_mode_params[mode].enabled = true; |
| 419 | break; | 473 | break; |
| 420 | case 'h': | 474 | case 'h': |
| 421 | default: | 475 | default: |
| @@ -426,8 +480,6 @@ int main(int argc, char *argv[]) | |||
| 426 | 480 | ||
| 427 | TEST_ASSERT(iterations > 2, "Iterations must be greater than two"); | 481 | TEST_ASSERT(iterations > 2, "Iterations must be greater than two"); |
| 428 | TEST_ASSERT(interval > 0, "Interval must be greater than zero"); | 482 | TEST_ASSERT(interval > 0, "Interval must be greater than zero"); |
| 429 | TEST_ASSERT(!top_offset || guest_test_mem == DEFAULT_GUEST_TEST_MEM, | ||
| 430 | "Cannot use both -o [offset] and -t at the same time"); | ||
| 431 | 483 | ||
| 432 | DEBUG("Test iterations: %"PRIu64", interval: %"PRIu64" (ms)\n", | 484 | DEBUG("Test iterations: %"PRIu64", interval: %"PRIu64" (ms)\n", |
| 433 | iterations, interval); | 485 | iterations, interval); |
| @@ -435,13 +487,12 @@ int main(int argc, char *argv[]) | |||
| 435 | srandom(time(0)); | 487 | srandom(time(0)); |
| 436 | 488 | ||
| 437 | for (i = 0; i < NUM_VM_MODES; ++i) { | 489 | for (i = 0; i < NUM_VM_MODES; ++i) { |
| 438 | if (!vm_guest_modes[i].enabled) | 490 | if (!vm_guest_mode_params[i].enabled) |
| 439 | continue; | 491 | continue; |
| 440 | TEST_ASSERT(vm_guest_modes[i].supported, | 492 | TEST_ASSERT(vm_guest_mode_params[i].supported, |
| 441 | "Guest mode ID %d (%s) not supported.", | 493 | "Guest mode ID %d (%s) not supported.", |
| 442 | vm_guest_modes[i].mode, | 494 | i, vm_guest_mode_string(i)); |
| 443 | vm_guest_mode_string(vm_guest_modes[i].mode)); | 495 | run_test(i, iterations, interval, phys_offset); |
| 444 | run_test(vm_guest_modes[i].mode, iterations, interval, top_offset); | ||
| 445 | } | 496 | } |
| 446 | 497 | ||
| 447 | return 0; | 498 | return 0; |
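With USE_CLEAR_DIRTY_LOG defined (the clear_dirty_log_test build added above), each iteration now fetches the dirty bitmap and then immediately re-protects exactly what it fetched. Condensed into one hypothetical helper, using the selftest library calls from this series and mirroring the DIV_ROUND_UP rounding in the hunk above:

/* Sketch of the per-iteration harvest-and-clear step, not the test code itself. */
#include <stdint.h>
#include "kvm_util.h"

#define ROUND_UP_64(n)  ((((n) + 63) / 64) * 64)

static void harvest_and_clear(struct kvm_vm *vm, int slot,
                              unsigned long *bmap, uint64_t host_num_pages)
{
        /* With manual protection enabled, GET no longer clears the log. */
        kvm_vm_get_dirty_log(vm, slot, bmap);

        /*
         * Clear (and re-write-protect) the range that was just read;
         * num_pages has to be a multiple of 64, hence the round-up.
         */
        kvm_vm_clear_dirty_log(vm, slot, bmap, 0, ROUND_UP_64(host_num_pages));
}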
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index a4e59e3b4826..a84785b02557 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h | |||
| @@ -36,6 +36,8 @@ typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */ | |||
| 36 | enum vm_guest_mode { | 36 | enum vm_guest_mode { |
| 37 | VM_MODE_P52V48_4K, | 37 | VM_MODE_P52V48_4K, |
| 38 | VM_MODE_P52V48_64K, | 38 | VM_MODE_P52V48_64K, |
| 39 | VM_MODE_P48V48_4K, | ||
| 40 | VM_MODE_P48V48_64K, | ||
| 39 | VM_MODE_P40V48_4K, | 41 | VM_MODE_P40V48_4K, |
| 40 | VM_MODE_P40V48_64K, | 42 | VM_MODE_P40V48_64K, |
| 41 | NUM_VM_MODES, | 43 | NUM_VM_MODES, |
| @@ -54,10 +56,14 @@ int kvm_check_cap(long cap); | |||
| 54 | int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap); | 56 | int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap); |
| 55 | 57 | ||
| 56 | struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm); | 58 | struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm); |
| 59 | struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, | ||
| 60 | int perm, unsigned long type); | ||
| 57 | void kvm_vm_free(struct kvm_vm *vmp); | 61 | void kvm_vm_free(struct kvm_vm *vmp); |
| 58 | void kvm_vm_restart(struct kvm_vm *vmp, int perm); | 62 | void kvm_vm_restart(struct kvm_vm *vmp, int perm); |
| 59 | void kvm_vm_release(struct kvm_vm *vmp); | 63 | void kvm_vm_release(struct kvm_vm *vmp); |
| 60 | void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log); | 64 | void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log); |
| 65 | void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, | ||
| 66 | uint64_t first_page, uint32_t num_pages); | ||
| 61 | 67 | ||
| 62 | int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva, | 68 | int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva, |
| 63 | size_t len); | 69 | size_t len); |
| @@ -78,6 +84,8 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, | |||
| 78 | 84 | ||
| 79 | void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl, | 85 | void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl, |
| 80 | void *arg); | 86 | void *arg); |
| 87 | int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl, | ||
| 88 | void *arg); | ||
| 81 | void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); | 89 | void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); |
| 82 | void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); | 90 | void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); |
| 83 | void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot, | 91 | void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot, |
diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index b6022e2f116e..e8c42506a09d 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c | |||
| @@ -268,13 +268,20 @@ void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot) | |||
| 268 | 268 | ||
| 269 | switch (vm->mode) { | 269 | switch (vm->mode) { |
| 270 | case VM_MODE_P52V48_4K: | 270 | case VM_MODE_P52V48_4K: |
| 271 | tcr_el1 |= 0ul << 14; /* TG0 = 4KB */ | 271 | TEST_ASSERT(false, "AArch64 does not support 4K sized pages " |
| 272 | tcr_el1 |= 6ul << 32; /* IPS = 52 bits */ | 272 | "with 52-bit physical address ranges"); |
| 273 | break; | ||
| 274 | case VM_MODE_P52V48_64K: | 273 | case VM_MODE_P52V48_64K: |
| 275 | tcr_el1 |= 1ul << 14; /* TG0 = 64KB */ | 274 | tcr_el1 |= 1ul << 14; /* TG0 = 64KB */ |
| 276 | tcr_el1 |= 6ul << 32; /* IPS = 52 bits */ | 275 | tcr_el1 |= 6ul << 32; /* IPS = 52 bits */ |
| 277 | break; | 276 | break; |
| 277 | case VM_MODE_P48V48_4K: | ||
| 278 | tcr_el1 |= 0ul << 14; /* TG0 = 4KB */ | ||
| 279 | tcr_el1 |= 5ul << 32; /* IPS = 48 bits */ | ||
| 280 | break; | ||
| 281 | case VM_MODE_P48V48_64K: | ||
| 282 | tcr_el1 |= 1ul << 14; /* TG0 = 64KB */ | ||
| 283 | tcr_el1 |= 5ul << 32; /* IPS = 48 bits */ | ||
| 284 | break; | ||
| 278 | case VM_MODE_P40V48_4K: | 285 | case VM_MODE_P40V48_4K: |
| 279 | tcr_el1 |= 0ul << 14; /* TG0 = 4KB */ | 286 | tcr_el1 |= 0ul << 14; /* TG0 = 4KB */ |
| 280 | tcr_el1 |= 2ul << 32; /* IPS = 40 bits */ | 287 | tcr_el1 |= 2ul << 32; /* IPS = 40 bits */ |
| @@ -305,7 +312,6 @@ void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) | |||
| 305 | get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pstate), &pstate); | 312 | get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pstate), &pstate); |
| 306 | get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), &pc); | 313 | get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), &pc); |
| 307 | 314 | ||
| 308 | fprintf(stream, "%*spstate: 0x%.16llx pc: 0x%.16llx\n", | 315 | fprintf(stream, "%*spstate: 0x%.16llx pc: 0x%.16llx\n", |
| 309 | indent, "", pstate, pc); | 316 | indent, "", pstate, pc); |
| 310 | |||
| 311 | } | 317 | } |
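The new P48 cases change only two TCR_EL1 fields: TG0 selects the translation granule and IPS the intermediate physical address size. A small illustrative helper (not part of the selftest library) that produces the same encodings as the switch above:

/* Illustrative only: TCR_EL1 TG0/IPS encodings used by vcpu_setup() above. */
#include <stdint.h>

static uint64_t tcr_el1_bits(unsigned int pa_bits, unsigned int page_shift)
{
        uint64_t tcr = 0;

        /* TG0, bits [15:14]: 0 = 4KB granule, 1 = 64KB granule. */
        tcr |= (page_shift == 16 ? 1ul : 0ul) << 14;

        /* IPS, bits [34:32]: 2 = 40-bit, 5 = 48-bit, 6 = 52-bit PA range. */
        switch (pa_bits) {
        case 40: tcr |= 2ul << 32; break;
        case 48: tcr |= 5ul << 32; break;
        case 52: tcr |= 6ul << 32; break;
        }
        return tcr;
}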
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 1b41e71283d5..23022e9d32eb 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c | |||
| @@ -85,13 +85,13 @@ int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap) | |||
| 85 | return ret; | 85 | return ret; |
| 86 | } | 86 | } |
| 87 | 87 | ||
| 88 | static void vm_open(struct kvm_vm *vm, int perm) | 88 | static void vm_open(struct kvm_vm *vm, int perm, unsigned long type) |
| 89 | { | 89 | { |
| 90 | vm->kvm_fd = open(KVM_DEV_PATH, perm); | 90 | vm->kvm_fd = open(KVM_DEV_PATH, perm); |
| 91 | if (vm->kvm_fd < 0) | 91 | if (vm->kvm_fd < 0) |
| 92 | exit(KSFT_SKIP); | 92 | exit(KSFT_SKIP); |
| 93 | 93 | ||
| 94 | vm->fd = ioctl(vm->kvm_fd, KVM_CREATE_VM, NULL); | 94 | vm->fd = ioctl(vm->kvm_fd, KVM_CREATE_VM, type); |
| 95 | TEST_ASSERT(vm->fd >= 0, "KVM_CREATE_VM ioctl failed, " | 95 | TEST_ASSERT(vm->fd >= 0, "KVM_CREATE_VM ioctl failed, " |
| 96 | "rc: %i errno: %i", vm->fd, errno); | 96 | "rc: %i errno: %i", vm->fd, errno); |
| 97 | } | 97 | } |
| @@ -99,9 +99,13 @@ static void vm_open(struct kvm_vm *vm, int perm) | |||
| 99 | const char * const vm_guest_mode_string[] = { | 99 | const char * const vm_guest_mode_string[] = { |
| 100 | "PA-bits:52, VA-bits:48, 4K pages", | 100 | "PA-bits:52, VA-bits:48, 4K pages", |
| 101 | "PA-bits:52, VA-bits:48, 64K pages", | 101 | "PA-bits:52, VA-bits:48, 64K pages", |
| 102 | "PA-bits:48, VA-bits:48, 4K pages", | ||
| 103 | "PA-bits:48, VA-bits:48, 64K pages", | ||
| 102 | "PA-bits:40, VA-bits:48, 4K pages", | 104 | "PA-bits:40, VA-bits:48, 4K pages", |
| 103 | "PA-bits:40, VA-bits:48, 64K pages", | 105 | "PA-bits:40, VA-bits:48, 64K pages", |
| 104 | }; | 106 | }; |
| 107 | _Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES, | ||
| 108 | "Missing new mode strings?"); | ||
| 105 | 109 | ||
| 106 | /* | 110 | /* |
| 107 | * VM Create | 111 | * VM Create |
| @@ -122,7 +126,8 @@ const char * const vm_guest_mode_string[] = { | |||
| 122 | * descriptor to control the created VM is created with the permissions | 126 | * descriptor to control the created VM is created with the permissions |
| 123 | * given by perm (e.g. O_RDWR). | 127 | * given by perm (e.g. O_RDWR). |
| 124 | */ | 128 | */ |
| 125 | struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) | 129 | struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, |
| 130 | int perm, unsigned long type) | ||
| 126 | { | 131 | { |
| 127 | struct kvm_vm *vm; | 132 | struct kvm_vm *vm; |
| 128 | int kvm_fd; | 133 | int kvm_fd; |
| @@ -131,22 +136,38 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) | |||
| 131 | TEST_ASSERT(vm != NULL, "Insufficient Memory"); | 136 | TEST_ASSERT(vm != NULL, "Insufficient Memory"); |
| 132 | 137 | ||
| 133 | vm->mode = mode; | 138 | vm->mode = mode; |
| 134 | vm_open(vm, perm); | 139 | vm->type = type; |
| 140 | vm_open(vm, perm, type); | ||
| 135 | 141 | ||
| 136 | /* Setup mode specific traits. */ | 142 | /* Setup mode specific traits. */ |
| 137 | switch (vm->mode) { | 143 | switch (vm->mode) { |
| 138 | case VM_MODE_P52V48_4K: | 144 | case VM_MODE_P52V48_4K: |
| 139 | vm->pgtable_levels = 4; | 145 | vm->pgtable_levels = 4; |
| 146 | vm->pa_bits = 52; | ||
| 147 | vm->va_bits = 48; | ||
| 140 | vm->page_size = 0x1000; | 148 | vm->page_size = 0x1000; |
| 141 | vm->page_shift = 12; | 149 | vm->page_shift = 12; |
| 142 | vm->va_bits = 48; | ||
| 143 | break; | 150 | break; |
| 144 | case VM_MODE_P52V48_64K: | 151 | case VM_MODE_P52V48_64K: |
| 145 | vm->pgtable_levels = 3; | 152 | vm->pgtable_levels = 3; |
| 146 | vm->pa_bits = 52; | 153 | vm->pa_bits = 52; |
| 154 | vm->va_bits = 48; | ||
| 147 | vm->page_size = 0x10000; | 155 | vm->page_size = 0x10000; |
| 148 | vm->page_shift = 16; | 156 | vm->page_shift = 16; |
| 157 | break; | ||
| 158 | case VM_MODE_P48V48_4K: | ||
| 159 | vm->pgtable_levels = 4; | ||
| 160 | vm->pa_bits = 48; | ||
| 161 | vm->va_bits = 48; | ||
| 162 | vm->page_size = 0x1000; | ||
| 163 | vm->page_shift = 12; | ||
| 164 | break; | ||
| 165 | case VM_MODE_P48V48_64K: | ||
| 166 | vm->pgtable_levels = 3; | ||
| 167 | vm->pa_bits = 48; | ||
| 149 | vm->va_bits = 48; | 168 | vm->va_bits = 48; |
| 169 | vm->page_size = 0x10000; | ||
| 170 | vm->page_shift = 16; | ||
| 150 | break; | 171 | break; |
| 151 | case VM_MODE_P40V48_4K: | 172 | case VM_MODE_P40V48_4K: |
| 152 | vm->pgtable_levels = 4; | 173 | vm->pgtable_levels = 4; |
| @@ -186,6 +207,11 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) | |||
| 186 | return vm; | 207 | return vm; |
| 187 | } | 208 | } |
| 188 | 209 | ||
| 210 | struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) | ||
| 211 | { | ||
| 212 | return _vm_create(mode, phy_pages, perm, 0); | ||
| 213 | } | ||
| 214 | |||
| 189 | /* | 215 | /* |
| 190 | * VM Restart | 216 | * VM Restart |
| 191 | * | 217 | * |
| @@ -203,7 +229,7 @@ void kvm_vm_restart(struct kvm_vm *vmp, int perm) | |||
| 203 | { | 229 | { |
| 204 | struct userspace_mem_region *region; | 230 | struct userspace_mem_region *region; |
| 205 | 231 | ||
| 206 | vm_open(vmp, perm); | 232 | vm_open(vmp, perm, vmp->type); |
| 207 | if (vmp->has_irqchip) | 233 | if (vmp->has_irqchip) |
| 208 | vm_create_irqchip(vmp); | 234 | vm_create_irqchip(vmp); |
| 209 | 235 | ||
| @@ -231,6 +257,19 @@ void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log) | |||
| 231 | strerror(-ret)); | 257 | strerror(-ret)); |
| 232 | } | 258 | } |
| 233 | 259 | ||
| 260 | void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, | ||
| 261 | uint64_t first_page, uint32_t num_pages) | ||
| 262 | { | ||
| 263 | struct kvm_clear_dirty_log args = { .dirty_bitmap = log, .slot = slot, | ||
| 264 | .first_page = first_page, | ||
| 265 | .num_pages = num_pages }; | ||
| 266 | int ret; | ||
| 267 | |||
| 268 | ret = ioctl(vm->fd, KVM_CLEAR_DIRTY_LOG, &args); | ||
| 269 | TEST_ASSERT(ret == 0, "%s: KVM_CLEAR_DIRTY_LOG failed: %s", | ||
| 270 | __func__, strerror(-ret)); | ||
| 271 | } | ||
| 272 | |||
| 234 | /* | 273 | /* |
| 235 | * Userspace Memory Region Find | 274 | * Userspace Memory Region Find |
| 236 | * | 275 | * |
| @@ -1270,14 +1309,24 @@ int _vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs) | |||
| 1270 | void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, | 1309 | void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, |
| 1271 | unsigned long cmd, void *arg) | 1310 | unsigned long cmd, void *arg) |
| 1272 | { | 1311 | { |
| 1312 | int ret; | ||
| 1313 | |||
| 1314 | ret = _vcpu_ioctl(vm, vcpuid, cmd, arg); | ||
| 1315 | TEST_ASSERT(ret == 0, "vcpu ioctl %lu failed, rc: %i errno: %i (%s)", | ||
| 1316 | cmd, ret, errno, strerror(errno)); | ||
| 1317 | } | ||
| 1318 | |||
| 1319 | int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, | ||
| 1320 | unsigned long cmd, void *arg) | ||
| 1321 | { | ||
| 1273 | struct vcpu *vcpu = vcpu_find(vm, vcpuid); | 1322 | struct vcpu *vcpu = vcpu_find(vm, vcpuid); |
| 1274 | int ret; | 1323 | int ret; |
| 1275 | 1324 | ||
| 1276 | TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); | 1325 | TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); |
| 1277 | 1326 | ||
| 1278 | ret = ioctl(vcpu->fd, cmd, arg); | 1327 | ret = ioctl(vcpu->fd, cmd, arg); |
| 1279 | TEST_ASSERT(ret == 0, "vcpu ioctl %lu failed, rc: %i errno: %i (%s)", | 1328 | |
| 1280 | cmd, ret, errno, strerror(errno)); | 1329 | return ret; |
| 1281 | } | 1330 | } |
| 1282 | 1331 | ||
| 1283 | /* | 1332 | /* |
| @@ -1422,7 +1471,7 @@ const char *exit_reason_str(unsigned int exit_reason) | |||
| 1422 | * | 1471 | * |
| 1423 | * Within the VM specified by vm, locates a range of available physical | 1472 | * Within the VM specified by vm, locates a range of available physical |
| 1424 | * pages at or above paddr_min. If found, the pages are marked as in use | 1473 | * pages at or above paddr_min. If found, the pages are marked as in use |
| 1425 | * and thier base address is returned. A TEST_ASSERT failure occurs if | 1474 | * and their base address is returned. A TEST_ASSERT failure occurs if |
| 1426 | * not enough pages are available at or above paddr_min. | 1475 | * not enough pages are available at or above paddr_min. |
| 1427 | */ | 1476 | */ |
| 1428 | vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, | 1477 | vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, |
diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h index 52701db0f253..4595e42c6e29 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h +++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h | |||
| @@ -44,6 +44,7 @@ struct vcpu { | |||
| 44 | 44 | ||
| 45 | struct kvm_vm { | 45 | struct kvm_vm { |
| 46 | int mode; | 46 | int mode; |
| 47 | unsigned long type; | ||
| 47 | int kvm_fd; | 48 | int kvm_fd; |
| 48 | int fd; | 49 | int fd; |
| 49 | unsigned int pgtable_levels; | 50 | unsigned int pgtable_levels; |
diff --git a/tools/testing/selftests/kvm/lib/ucall.c b/tools/testing/selftests/kvm/lib/ucall.c index 4777f9bb5194..a2ab38be2f47 100644 --- a/tools/testing/selftests/kvm/lib/ucall.c +++ b/tools/testing/selftests/kvm/lib/ucall.c | |||
| @@ -34,7 +34,8 @@ void ucall_init(struct kvm_vm *vm, ucall_type_t type, void *arg) | |||
| 34 | return; | 34 | return; |
| 35 | 35 | ||
| 36 | if (type == UCALL_MMIO) { | 36 | if (type == UCALL_MMIO) { |
| 37 | vm_paddr_t gpa, start, end, step; | 37 | vm_paddr_t gpa, start, end, step, offset; |
| 38 | unsigned bits; | ||
| 38 | bool ret; | 39 | bool ret; |
| 39 | 40 | ||
| 40 | if (arg) { | 41 | if (arg) { |
| @@ -45,25 +46,30 @@ void ucall_init(struct kvm_vm *vm, ucall_type_t type, void *arg) | |||
| 45 | } | 46 | } |
| 46 | 47 | ||
| 47 | /* | 48 | /* |
| 48 | * Find an address within the allowed virtual address space, | 49 | * Find an address within the allowed physical and virtual address |
| 49 | * that does _not_ have a KVM memory region associated with it. | 50 | * spaces, that does _not_ have a KVM memory region associated with |
| 50 | * Identity mapping an address like this allows the guest to | 51 | * it. Identity mapping an address like this allows the guest to |
| 51 | * access it, but as KVM doesn't know what to do with it, it | 52 | * access it, but as KVM doesn't know what to do with it, it |
| 52 | * will assume it's something userspace handles and exit with | 53 | * will assume it's something userspace handles and exit with |
| 53 | * KVM_EXIT_MMIO. Well, at least that's how it works for AArch64. | 54 | * KVM_EXIT_MMIO. Well, at least that's how it works for AArch64. |
| 54 | * Here we start with a guess that the addresses around two | 55 | * Here we start with a guess that the addresses around 5/8th |
| 55 | * thirds of the VA space are unmapped and then work both down | 56 | * of the allowed space are unmapped and then work both down and |
| 56 | * and up from there in 1/6 VA space sized steps. | 57 | * up from there in 1/16th allowed space sized steps. |
| 58 | * | ||
| 59 | * Note, we need to use VA-bits - 1 when calculating the allowed | ||
| 60 | * virtual address space for an identity mapping because the upper | ||
| 61 | * half of the virtual address space is the two's complement of the | ||
| 62 | * lower and won't match physical addresses. | ||
| 57 | */ | 63 | */ |
| 58 | start = 1ul << (vm->va_bits * 2 / 3); | 64 | bits = vm->va_bits - 1; |
| 59 | end = 1ul << vm->va_bits; | 65 | bits = vm->pa_bits < bits ? vm->pa_bits : bits; |
| 60 | step = 1ul << (vm->va_bits / 6); | 66 | end = 1ul << bits; |
| 61 | for (gpa = start; gpa >= 0; gpa -= step) { | 67 | start = end * 5 / 8; |
| 62 | if (ucall_mmio_init(vm, gpa & ~(vm->page_size - 1))) | 68 | step = end / 16; |
| 69 | for (offset = 0; offset < end - start; offset += step) { | ||
| 70 | if (ucall_mmio_init(vm, start - offset)) | ||
| 63 | return; | 71 | return; |
| 64 | } | 72 | if (ucall_mmio_init(vm, start + offset)) |
| 65 | for (gpa = start + step; gpa < end; gpa += step) { | ||
| 66 | if (ucall_mmio_init(vm, gpa & ~(vm->page_size - 1))) | ||
| 67 | return; | 73 | return; |
| 68 | } | 74 | } |
| 69 | TEST_ASSERT(false, "Can't find a ucall mmio address"); | 75 | TEST_ASSERT(false, "Can't find a ucall mmio address"); |
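The reworked search above caps the identity-mappable range at min(pa_bits, va_bits - 1) bits and probes outwards from 5/8 of that range in 1/16 steps. A standalone sketch that just prints the probe order for a hypothetical 40-bit range makes the pattern easy to see: end is 1 TiB, start is 640 GiB, step is 64 GiB, and the candidates fan out as 640, 576, 704, 512, 768 GiB and so on.

/* Prints the ucall MMIO probe order for a 40-bit identity-mappable range. */
#include <stdio.h>

int main(void)
{
        unsigned int bits = 40;                 /* min(pa_bits, va_bits - 1) */
        unsigned long end = 1ul << bits;
        unsigned long start = end * 5 / 8;
        unsigned long step = end / 16;
        unsigned long offset;

        for (offset = 0; offset < end - start; offset += step) {
                printf("try 0x%lx\n", start - offset);
                printf("try 0x%lx\n", start + offset);
        }
        return 0;
}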
diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index 92c2cfd1b182..ea3c73e8f4f6 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c | |||
| @@ -113,8 +113,8 @@ int main(int argc, char *argv[]) | |||
| 113 | for (stage = 1;; stage++) { | 113 | for (stage = 1;; stage++) { |
| 114 | _vcpu_run(vm, VCPU_ID); | 114 | _vcpu_run(vm, VCPU_ID); |
| 115 | TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, | 115 | TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, |
| 116 | "Unexpected exit reason: %u (%s),\n", | 116 | "Stage %d: unexpected exit reason: %u (%s),\n", |
| 117 | run->exit_reason, | 117 | stage, run->exit_reason, |
| 118 | exit_reason_str(run->exit_reason)); | 118 | exit_reason_str(run->exit_reason)); |
| 119 | 119 | ||
| 120 | memset(®s1, 0, sizeof(regs1)); | 120 | memset(®s1, 0, sizeof(regs1)); |
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c new file mode 100644 index 000000000000..264425f75806 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c | |||
| @@ -0,0 +1,157 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0 | ||
| 2 | /* | ||
| 3 | * Test for x86 KVM_CAP_HYPERV_CPUID | ||
| 4 | * | ||
| 5 | * Copyright (C) 2018, Red Hat, Inc. | ||
| 6 | * | ||
| 7 | * This work is licensed under the terms of the GNU GPL, version 2. | ||
| 8 | * | ||
| 9 | */ | ||
| 10 | |||
| 11 | #define _GNU_SOURCE /* for program_invocation_short_name */ | ||
| 12 | #include <fcntl.h> | ||
| 13 | #include <stdio.h> | ||
| 14 | #include <stdlib.h> | ||
| 15 | #include <string.h> | ||
| 16 | #include <sys/ioctl.h> | ||
| 17 | |||
| 18 | #include "test_util.h" | ||
| 19 | #include "kvm_util.h" | ||
| 20 | #include "processor.h" | ||
| 21 | |||
| 22 | #define VCPU_ID 0 | ||
| 23 | |||
| 24 | static void guest_code(void) | ||
| 25 | { | ||
| 26 | } | ||
| 27 | |||
| 28 | static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries, | ||
| 29 | int evmcs_enabled) | ||
| 30 | { | ||
| 31 | int i; | ||
| 32 | |||
| 33 | if (!evmcs_enabled) | ||
| 34 | TEST_ASSERT(hv_cpuid_entries->nent == 6, | ||
| 35 | "KVM_GET_SUPPORTED_HV_CPUID should return 6 entries" | ||
| 36 | " when Enlightened VMCS is disabled (returned %d)", | ||
| 37 | hv_cpuid_entries->nent); | ||
| 38 | else | ||
| 39 | TEST_ASSERT(hv_cpuid_entries->nent == 7, | ||
| 40 | "KVM_GET_SUPPORTED_HV_CPUID should return 7 entries" | ||
| 41 | " when Enlightened VMCS is enabled (returned %d)", | ||
| 42 | hv_cpuid_entries->nent); | ||
| 43 | |||
| 44 | for (i = 0; i < hv_cpuid_entries->nent; i++) { | ||
| 45 | struct kvm_cpuid_entry2 *entry = &hv_cpuid_entries->entries[i]; | ||
| 46 | |||
| 47 | TEST_ASSERT((entry->function >= 0x40000000) && | ||
| 48 | (entry->function <= 0x4000000A), | ||
| 49 | "function %lx is our of supported range", | ||
| 50 | entry->function); | ||
| 51 | |||
| 52 | TEST_ASSERT(entry->index == 0, | ||
| 53 | ".index field should be zero"); | ||
| 54 | |||
| 55 | TEST_ASSERT(entry->index == 0, | ||
| 56 | ".index field should be zero"); | ||
| 57 | |||
| 58 | TEST_ASSERT(entry->flags == 0, | ||
| 59 | ".flags field should be zero"); | ||
| 60 | |||
| 61 | TEST_ASSERT(entry->padding[0] == 0 && entry->padding[1] == 0 && | ||
| 62 | entry->padding[2] == 0, | ||
| 63 | ".padding fields should be zero"); | ||
| 64 | |||
| 65 | /* | ||
| 66 | * If needed for debug: | ||
| 67 | * fprintf(stdout, | ||
| 68 | * "CPUID%lx EAX=0x%lx EBX=0x%lx ECX=0x%lx EDX=0x%lx\n", | ||
| 69 | * entry->function, entry->eax, entry->ebx, entry->ecx, | ||
| 70 | * entry->edx); | ||
| 71 | */ | ||
| 72 | } | ||
| 73 | |||
| 74 | } | ||
| 75 | |||
| 76 | void test_hv_cpuid_e2big(struct kvm_vm *vm) | ||
| 77 | { | ||
| 78 | static struct kvm_cpuid2 cpuid = {.nent = 0}; | ||
| 79 | int ret; | ||
| 80 | |||
| 81 | ret = _vcpu_ioctl(vm, VCPU_ID, KVM_GET_SUPPORTED_HV_CPUID, &cpuid); | ||
| 82 | |||
| 83 | TEST_ASSERT(ret == -1 && errno == E2BIG, | ||
| 84 | "KVM_GET_SUPPORTED_HV_CPUID didn't fail with -E2BIG when" | ||
| 85 | " it should have: %d %d", ret, errno); | ||
| 86 | } | ||
| 87 | |||
| 88 | |||
| 89 | struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(struct kvm_vm *vm) | ||
| 90 | { | ||
| 91 | int nent = 20; /* should be enough */ | ||
| 92 | static struct kvm_cpuid2 *cpuid; | ||
| 93 | int ret; | ||
| 94 | |||
| 95 | cpuid = malloc(sizeof(*cpuid) + nent * sizeof(struct kvm_cpuid_entry2)); | ||
| 96 | |||
| 97 | if (!cpuid) { | ||
| 98 | perror("malloc"); | ||
| 99 | abort(); | ||
| 100 | } | ||
| 101 | |||
| 102 | cpuid->nent = nent; | ||
| 103 | |||
| 104 | vcpu_ioctl(vm, VCPU_ID, KVM_GET_SUPPORTED_HV_CPUID, cpuid); | ||
| 105 | |||
| 106 | return cpuid; | ||
| 107 | } | ||
| 108 | |||
| 109 | |||
| 110 | int main(int argc, char *argv[]) | ||
| 111 | { | ||
| 112 | struct kvm_vm *vm; | ||
| 113 | int rv; | ||
| 114 | uint16_t evmcs_ver; | ||
| 115 | struct kvm_cpuid2 *hv_cpuid_entries; | ||
| 116 | struct kvm_enable_cap enable_evmcs_cap = { | ||
| 117 | .cap = KVM_CAP_HYPERV_ENLIGHTENED_VMCS, | ||
| 118 | .args[0] = (unsigned long)&evmcs_ver | ||
| 119 | }; | ||
| 120 | |||
| 121 | /* Tell stdout not to buffer its content */ | ||
| 122 | setbuf(stdout, NULL); | ||
| 123 | |||
| 124 | rv = kvm_check_cap(KVM_CAP_HYPERV_CPUID); | ||
| 125 | if (!rv) { | ||
| 126 | fprintf(stderr, | ||
| 127 | "KVM_CAP_HYPERV_CPUID not supported, skip test\n"); | ||
| 128 | exit(KSFT_SKIP); | ||
| 129 | } | ||
| 130 | |||
| 131 | /* Create VM */ | ||
| 132 | vm = vm_create_default(VCPU_ID, 0, guest_code); | ||
| 133 | |||
| 134 | test_hv_cpuid_e2big(vm); | ||
| 135 | |||
| 136 | hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm); | ||
| 137 | if (!hv_cpuid_entries) | ||
| 138 | return 1; | ||
| 139 | |||
| 140 | test_hv_cpuid(hv_cpuid_entries, 0); | ||
| 141 | |||
| 142 | free(hv_cpuid_entries); | ||
| 143 | |||
| 144 | vcpu_ioctl(vm, VCPU_ID, KVM_ENABLE_CAP, &enable_evmcs_cap); | ||
| 145 | |||
| 146 | hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm); | ||
| 147 | if (!hv_cpuid_entries) | ||
| 148 | return 1; | ||
| 149 | |||
| 150 | test_hv_cpuid(hv_cpuid_entries, 1); | ||
| 151 | |||
| 152 | free(hv_cpuid_entries); | ||
| 153 | |||
| 154 | kvm_vm_free(vm); | ||
| 155 | |||
| 156 | return 0; | ||
| 157 | } | ||
diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c index 03da41f0f736..4b3f556265f1 100644 --- a/tools/testing/selftests/kvm/x86_64/state_test.c +++ b/tools/testing/selftests/kvm/x86_64/state_test.c | |||
| @@ -152,8 +152,8 @@ int main(int argc, char *argv[]) | |||
| 152 | for (stage = 1;; stage++) { | 152 | for (stage = 1;; stage++) { |
| 153 | _vcpu_run(vm, VCPU_ID); | 153 | _vcpu_run(vm, VCPU_ID); |
| 154 | TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, | 154 | TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, |
| 155 | "Unexpected exit reason: %u (%s),\n", | 155 | "Stage %d: unexpected exit reason: %u (%s),\n", |
| 156 | run->exit_reason, | 156 | stage, run->exit_reason, |
| 157 | exit_reason_str(run->exit_reason)); | 157 | exit_reason_str(run->exit_reason)); |
| 158 | 158 | ||
| 159 | memset(®s1, 0, sizeof(regs1)); | 159 | memset(®s1, 0, sizeof(regs1)); |
diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 0a8e75886224..8b0f16409ed7 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk | |||
| @@ -16,18 +16,18 @@ TEST_GEN_PROGS := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS)) | |||
| 16 | TEST_GEN_PROGS_EXTENDED := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS_EXTENDED)) | 16 | TEST_GEN_PROGS_EXTENDED := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS_EXTENDED)) |
| 17 | TEST_GEN_FILES := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_FILES)) | 17 | TEST_GEN_FILES := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_FILES)) |
| 18 | 18 | ||
| 19 | ifdef KSFT_KHDR_INSTALL | ||
| 19 | top_srcdir ?= ../../../.. | 20 | top_srcdir ?= ../../../.. |
| 20 | include $(top_srcdir)/scripts/subarch.include | 21 | include $(top_srcdir)/scripts/subarch.include |
| 21 | ARCH ?= $(SUBARCH) | 22 | ARCH ?= $(SUBARCH) |
| 22 | 23 | ||
| 23 | all: $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) | ||
| 24 | |||
| 25 | .PHONY: khdr | 24 | .PHONY: khdr |
| 26 | khdr: | 25 | khdr: |
| 27 | make ARCH=$(ARCH) -C $(top_srcdir) headers_install | 26 | make ARCH=$(ARCH) -C $(top_srcdir) headers_install |
| 28 | 27 | ||
| 29 | ifdef KSFT_KHDR_INSTALL | 28 | all: khdr $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) |
| 30 | $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES):| khdr | 29 | else |
| 30 | all: $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) | ||
| 31 | endif | 31 | endif |
| 32 | 32 | ||
| 33 | .ONESHELL: | 33 | .ONESHELL: |
diff --git a/tools/testing/selftests/networking/timestamping/Makefile b/tools/testing/selftests/networking/timestamping/Makefile index 14cfcf006936..c46c0eefab9e 100644 --- a/tools/testing/selftests/networking/timestamping/Makefile +++ b/tools/testing/selftests/networking/timestamping/Makefile | |||
| @@ -6,6 +6,7 @@ TEST_PROGS := hwtstamp_config rxtimestamp timestamping txtimestamp | |||
| 6 | all: $(TEST_PROGS) | 6 | all: $(TEST_PROGS) |
| 7 | 7 | ||
| 8 | top_srcdir = ../../../../.. | 8 | top_srcdir = ../../../../.. |
| 9 | KSFT_KHDR_INSTALL := 1 | ||
| 9 | include ../../lib.mk | 10 | include ../../lib.mk |
| 10 | 11 | ||
| 11 | clean: | 12 | clean: |
diff --git a/tools/testing/selftests/tc-testing/bpf/Makefile b/tools/testing/selftests/tc-testing/bpf/Makefile index dc92eb271d9a..be5a5e542804 100644 --- a/tools/testing/selftests/tc-testing/bpf/Makefile +++ b/tools/testing/selftests/tc-testing/bpf/Makefile | |||
| @@ -4,6 +4,7 @@ APIDIR := ../../../../include/uapi | |||
| 4 | TEST_GEN_FILES = action.o | 4 | TEST_GEN_FILES = action.o |
| 5 | 5 | ||
| 6 | top_srcdir = ../../../../.. | 6 | top_srcdir = ../../../../.. |
| 7 | KSFT_KHDR_INSTALL := 1 | ||
| 7 | include ../../lib.mk | 8 | include ../../lib.mk |
| 8 | 9 | ||
| 9 | CLANG ?= clang | 10 | CLANG ?= clang |
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 6e67e726e5a5..e13eb6cc8901 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile | |||
| @@ -25,6 +25,7 @@ TEST_GEN_FILES += virtual_address_range | |||
| 25 | 25 | ||
| 26 | TEST_PROGS := run_vmtests | 26 | TEST_PROGS := run_vmtests |
| 27 | 27 | ||
| 28 | KSFT_KHDR_INSTALL := 1 | ||
| 28 | include ../lib.mk | 29 | include ../lib.mk |
| 29 | 30 | ||
| 30 | $(OUTPUT)/userfaultfd: LDLIBS += -lpthread | 31 | $(OUTPUT)/userfaultfd: LDLIBS += -lpthread |
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 17cecc96f735..b07ac4614e1c 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c | |||
| @@ -70,11 +70,9 @@ static void soft_timer_start(struct hrtimer *hrt, u64 ns) | |||
| 70 | HRTIMER_MODE_ABS); | 70 | HRTIMER_MODE_ABS); |
| 71 | } | 71 | } |
| 72 | 72 | ||
| 73 | static void soft_timer_cancel(struct hrtimer *hrt, struct work_struct *work) | 73 | static void soft_timer_cancel(struct hrtimer *hrt) |
| 74 | { | 74 | { |
| 75 | hrtimer_cancel(hrt); | 75 | hrtimer_cancel(hrt); |
| 76 | if (work) | ||
| 77 | cancel_work_sync(work); | ||
| 78 | } | 76 | } |
| 79 | 77 | ||
| 80 | static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) | 78 | static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) |
| @@ -102,23 +100,6 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) | |||
| 102 | return IRQ_HANDLED; | 100 | return IRQ_HANDLED; |
| 103 | } | 101 | } |
| 104 | 102 | ||
| 105 | /* | ||
| 106 | * Work function for handling the backup timer that we schedule when a vcpu is | ||
| 107 | * no longer running, but had a timer programmed to fire in the future. | ||
| 108 | */ | ||
| 109 | static void kvm_timer_inject_irq_work(struct work_struct *work) | ||
| 110 | { | ||
| 111 | struct kvm_vcpu *vcpu; | ||
| 112 | |||
| 113 | vcpu = container_of(work, struct kvm_vcpu, arch.timer_cpu.expired); | ||
| 114 | |||
| 115 | /* | ||
| 116 | * If the vcpu is blocked we want to wake it up so that it will see | ||
| 117 | * the timer has expired when entering the guest. | ||
| 118 | */ | ||
| 119 | kvm_vcpu_wake_up(vcpu); | ||
| 120 | } | ||
| 121 | |||
| 122 | static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx) | 103 | static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx) |
| 123 | { | 104 | { |
| 124 | u64 cval, now; | 105 | u64 cval, now; |
| @@ -188,7 +169,7 @@ static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt) | |||
| 188 | return HRTIMER_RESTART; | 169 | return HRTIMER_RESTART; |
| 189 | } | 170 | } |
| 190 | 171 | ||
| 191 | schedule_work(&timer->expired); | 172 | kvm_vcpu_wake_up(vcpu); |
| 192 | return HRTIMER_NORESTART; | 173 | return HRTIMER_NORESTART; |
| 193 | } | 174 | } |
| 194 | 175 | ||
| @@ -300,7 +281,7 @@ static void phys_timer_emulate(struct kvm_vcpu *vcpu) | |||
| 300 | * then we also don't need a soft timer. | 281 | * then we also don't need a soft timer. |
| 301 | */ | 282 | */ |
| 302 | if (kvm_timer_should_fire(ptimer) || !kvm_timer_irq_can_fire(ptimer)) { | 283 | if (kvm_timer_should_fire(ptimer) || !kvm_timer_irq_can_fire(ptimer)) { |
| 303 | soft_timer_cancel(&timer->phys_timer, NULL); | 284 | soft_timer_cancel(&timer->phys_timer); |
| 304 | return; | 285 | return; |
| 305 | } | 286 | } |
| 306 | 287 | ||
| @@ -426,7 +407,7 @@ void kvm_timer_unschedule(struct kvm_vcpu *vcpu) | |||
| 426 | 407 | ||
| 427 | vtimer_restore_state(vcpu); | 408 | vtimer_restore_state(vcpu); |
| 428 | 409 | ||
| 429 | soft_timer_cancel(&timer->bg_timer, &timer->expired); | 410 | soft_timer_cancel(&timer->bg_timer); |
| 430 | } | 411 | } |
| 431 | 412 | ||
| 432 | static void set_cntvoff(u64 cntvoff) | 413 | static void set_cntvoff(u64 cntvoff) |
| @@ -544,7 +525,7 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) | |||
| 544 | * In any case, we re-schedule the hrtimer for the physical timer when | 525 | * In any case, we re-schedule the hrtimer for the physical timer when |
| 545 | * coming back to the VCPU thread in kvm_timer_vcpu_load(). | 526 | * coming back to the VCPU thread in kvm_timer_vcpu_load(). |
| 546 | */ | 527 | */ |
| 547 | soft_timer_cancel(&timer->phys_timer, NULL); | 528 | soft_timer_cancel(&timer->phys_timer); |
| 548 | 529 | ||
| 549 | /* | 530 | /* |
| 550 | * The kernel may decide to run userspace after calling vcpu_put, so | 531 | * The kernel may decide to run userspace after calling vcpu_put, so |
| @@ -637,7 +618,6 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) | |||
| 637 | update_vtimer_cntvoff(vcpu, kvm_phys_timer_read()); | 618 | update_vtimer_cntvoff(vcpu, kvm_phys_timer_read()); |
| 638 | vcpu_ptimer(vcpu)->cntvoff = 0; | 619 | vcpu_ptimer(vcpu)->cntvoff = 0; |
| 639 | 620 | ||
| 640 | INIT_WORK(&timer->expired, kvm_timer_inject_irq_work); | ||
| 641 | hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | 621 | hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); |
| 642 | timer->bg_timer.function = kvm_bg_timer_expire; | 622 | timer->bg_timer.function = kvm_bg_timer_expire; |
| 643 | 623 | ||
| @@ -792,11 +772,8 @@ out_free_irq: | |||
| 792 | void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) | 772 | void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) |
| 793 | { | 773 | { |
| 794 | struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; | 774 | struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; |
| 795 | struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); | ||
| 796 | 775 | ||
| 797 | soft_timer_cancel(&timer->bg_timer, &timer->expired); | 776 | soft_timer_cancel(&timer->bg_timer); |
| 798 | soft_timer_cancel(&timer->phys_timer, NULL); | ||
| 799 | kvm_vgic_unmap_phys_irq(vcpu, vtimer->irq.irq); | ||
| 800 | } | 777 | } |
| 801 | 778 | ||
| 802 | static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu) | 779 | static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu) |
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 36165748a315..9e350fd34504 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c | |||
| @@ -66,7 +66,7 @@ static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_arm_running_vcpu); | |||
| 66 | static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); | 66 | static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); |
| 67 | static u32 kvm_next_vmid; | 67 | static u32 kvm_next_vmid; |
| 68 | static unsigned int kvm_vmid_bits __read_mostly; | 68 | static unsigned int kvm_vmid_bits __read_mostly; |
| 69 | static DEFINE_RWLOCK(kvm_vmid_lock); | 69 | static DEFINE_SPINLOCK(kvm_vmid_lock); |
| 70 | 70 | ||
| 71 | static bool vgic_present; | 71 | static bool vgic_present; |
| 72 | 72 | ||
| @@ -484,7 +484,9 @@ void force_vm_exit(const cpumask_t *mask) | |||
| 484 | */ | 484 | */ |
| 485 | static bool need_new_vmid_gen(struct kvm *kvm) | 485 | static bool need_new_vmid_gen(struct kvm *kvm) |
| 486 | { | 486 | { |
| 487 | return unlikely(kvm->arch.vmid_gen != atomic64_read(&kvm_vmid_gen)); | 487 | u64 current_vmid_gen = atomic64_read(&kvm_vmid_gen); |
| 488 | smp_rmb(); /* Orders read of kvm_vmid_gen and kvm->arch.vmid */ | ||
| 489 | return unlikely(READ_ONCE(kvm->arch.vmid_gen) != current_vmid_gen); | ||
| 488 | } | 490 | } |
| 489 | 491 | ||
| 490 | /** | 492 | /** |
| @@ -499,16 +501,11 @@ static void update_vttbr(struct kvm *kvm) | |||
| 499 | { | 501 | { |
| 500 | phys_addr_t pgd_phys; | 502 | phys_addr_t pgd_phys; |
| 501 | u64 vmid, cnp = kvm_cpu_has_cnp() ? VTTBR_CNP_BIT : 0; | 503 | u64 vmid, cnp = kvm_cpu_has_cnp() ? VTTBR_CNP_BIT : 0; |
| 502 | bool new_gen; | ||
| 503 | 504 | ||
| 504 | read_lock(&kvm_vmid_lock); | 505 | if (!need_new_vmid_gen(kvm)) |
| 505 | new_gen = need_new_vmid_gen(kvm); | ||
| 506 | read_unlock(&kvm_vmid_lock); | ||
| 507 | |||
| 508 | if (!new_gen) | ||
| 509 | return; | 506 | return; |
| 510 | 507 | ||
| 511 | write_lock(&kvm_vmid_lock); | 508 | spin_lock(&kvm_vmid_lock); |
| 512 | 509 | ||
| 513 | /* | 510 | /* |
| 514 | * We need to re-check the vmid_gen here to ensure that if another vcpu | 511 | * We need to re-check the vmid_gen here to ensure that if another vcpu |
| @@ -516,7 +513,7 @@ static void update_vttbr(struct kvm *kvm) | |||
| 516 | * use the same vmid. | 513 | * use the same vmid. |
| 517 | */ | 514 | */ |
| 518 | if (!need_new_vmid_gen(kvm)) { | 515 | if (!need_new_vmid_gen(kvm)) { |
| 519 | write_unlock(&kvm_vmid_lock); | 516 | spin_unlock(&kvm_vmid_lock); |
| 520 | return; | 517 | return; |
| 521 | } | 518 | } |
| 522 | 519 | ||
| @@ -539,7 +536,6 @@ static void update_vttbr(struct kvm *kvm) | |||
| 539 | kvm_call_hyp(__kvm_flush_vm_context); | 536 | kvm_call_hyp(__kvm_flush_vm_context); |
| 540 | } | 537 | } |
| 541 | 538 | ||
| 542 | kvm->arch.vmid_gen = atomic64_read(&kvm_vmid_gen); | ||
| 543 | kvm->arch.vmid = kvm_next_vmid; | 539 | kvm->arch.vmid = kvm_next_vmid; |
| 544 | kvm_next_vmid++; | 540 | kvm_next_vmid++; |
| 545 | kvm_next_vmid &= (1 << kvm_vmid_bits) - 1; | 541 | kvm_next_vmid &= (1 << kvm_vmid_bits) - 1; |
| @@ -550,7 +546,10 @@ static void update_vttbr(struct kvm *kvm) | |||
| 550 | vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits); | 546 | vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits); |
| 551 | kvm->arch.vttbr = kvm_phys_to_vttbr(pgd_phys) | vmid | cnp; | 547 | kvm->arch.vttbr = kvm_phys_to_vttbr(pgd_phys) | vmid | cnp; |
| 552 | 548 | ||
| 553 | write_unlock(&kvm_vmid_lock); | 549 | smp_wmb(); |
| 550 | WRITE_ONCE(kvm->arch.vmid_gen, atomic64_read(&kvm_vmid_gen)); | ||
| 551 | |||
| 552 | spin_unlock(&kvm_vmid_lock); | ||
| 554 | } | 553 | } |
| 555 | 554 | ||
| 556 | static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) | 555 | static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) |
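With the rwlock gone, the fast path above is lock-free and relies on the smp_rmb() in need_new_vmid_gen() pairing with the smp_wmb() issued before vmid_gen is published. The standalone userspace program below is an analogue, not the kernel code or an exact model of its barriers: it shows the underlying publish/consume idea with C11 release/acquire, and CURRENT_GEN, vm_gen and vm_vmid are illustrative stand-ins for kvm_vmid_gen, kvm->arch.vmid_gen and kvm->arch.vmid.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define CURRENT_GEN 2UL                      /* stands in for kvm_vmid_gen */

static atomic_ulong vm_gen = 1;              /* stands in for kvm->arch.vmid_gen */
static unsigned long vm_vmid;                /* stands in for kvm->arch.vmid */

/* Writer side of update_vttbr(): set up the VMID state first, then publish
 * the new generation with release semantics (~ smp_wmb() + WRITE_ONCE()). */
static void update_vmid(void)
{
        vm_vmid = 42;
        atomic_store_explicit(&vm_gen, CURRENT_GEN, memory_order_release);
}

/* Reader side of need_new_vmid_gen(): an acquire read of the generation
 * (~ READ_ONCE() plus barrier) orders it before later reads of the state. */
static int need_new_vmid(void)
{
        return atomic_load_explicit(&vm_gen, memory_order_acquire) != CURRENT_GEN;
}

static void *vcpu_thread(void *arg)
{
        (void)arg;
        while (need_new_vmid())
                ;                            /* fast path: no lock taken */
        printf("vmid %lu is valid for generation %lu\n", vm_vmid, CURRENT_GEN);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, vcpu_thread, NULL);
        update_vmid();
        pthread_join(t, NULL);
        return 0;
}

Once the acquire load in need_new_vmid() observes CURRENT_GEN, the plain store to vm_vmid made before the release store is guaranteed to be visible, which is what makes a lock-free fast path safe.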
| @@ -674,8 +673,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) | |||
| 674 | ret = kvm_handle_mmio_return(vcpu, vcpu->run); | 673 | ret = kvm_handle_mmio_return(vcpu, vcpu->run); |
| 675 | if (ret) | 674 | if (ret) |
| 676 | return ret; | 675 | return ret; |
| 677 | if (kvm_arm_handle_step_debug(vcpu, vcpu->run)) | ||
| 678 | return 0; | ||
| 679 | } | 676 | } |
| 680 | 677 | ||
| 681 | if (run->immediate_exit) | 678 | if (run->immediate_exit) |
| @@ -1205,14 +1202,30 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
| 1205 | */ | 1202 | */ |
| 1206 | int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) | 1203 | int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) |
| 1207 | { | 1204 | { |
| 1208 | bool is_dirty = false; | 1205 | bool flush = false; |
| 1206 | int r; | ||
| 1207 | |||
| 1208 | mutex_lock(&kvm->slots_lock); | ||
| 1209 | |||
| 1210 | r = kvm_get_dirty_log_protect(kvm, log, &flush); | ||
| 1211 | |||
| 1212 | if (flush) | ||
| 1213 | kvm_flush_remote_tlbs(kvm); | ||
| 1214 | |||
| 1215 | mutex_unlock(&kvm->slots_lock); | ||
| 1216 | return r; | ||
| 1217 | } | ||
| 1218 | |||
| 1219 | int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log) | ||
| 1220 | { | ||
| 1221 | bool flush = false; | ||
| 1209 | int r; | 1222 | int r; |
| 1210 | 1223 | ||
| 1211 | mutex_lock(&kvm->slots_lock); | 1224 | mutex_lock(&kvm->slots_lock); |
| 1212 | 1225 | ||
| 1213 | r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); | 1226 | r = kvm_clear_dirty_log_protect(kvm, log, &flush); |
| 1214 | 1227 | ||
| 1215 | if (is_dirty) | 1228 | if (flush) |
| 1216 | kvm_flush_remote_tlbs(kvm); | 1229 | kvm_flush_remote_tlbs(kvm); |
| 1217 | 1230 | ||
| 1218 | mutex_unlock(&kvm->slots_lock); | 1231 | mutex_unlock(&kvm->slots_lock); |
diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c index 616e5a433ab0..9652c453480f 100644 --- a/virt/kvm/arm/hyp/vgic-v3-sr.c +++ b/virt/kvm/arm/hyp/vgic-v3-sr.c | |||
| @@ -1012,8 +1012,10 @@ int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) | |||
| 1012 | 1012 | ||
| 1013 | esr = kvm_vcpu_get_hsr(vcpu); | 1013 | esr = kvm_vcpu_get_hsr(vcpu); |
| 1014 | if (vcpu_mode_is_32bit(vcpu)) { | 1014 | if (vcpu_mode_is_32bit(vcpu)) { |
| 1015 | if (!kvm_condition_valid(vcpu)) | 1015 | if (!kvm_condition_valid(vcpu)) { |
| 1016 | __kvm_skip_instr(vcpu); | ||
| 1016 | return 1; | 1017 | return 1; |
| 1018 | } | ||
| 1017 | 1019 | ||
| 1018 | sysreg = esr_cp15_to_sysreg(esr); | 1020 | sysreg = esr_cp15_to_sysreg(esr); |
| 1019 | } else { | 1021 | } else { |
| @@ -1123,6 +1125,8 @@ int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) | |||
| 1123 | rt = kvm_vcpu_sys_get_rt(vcpu); | 1125 | rt = kvm_vcpu_sys_get_rt(vcpu); |
| 1124 | fn(vcpu, vmcr, rt); | 1126 | fn(vcpu, vmcr, rt); |
| 1125 | 1127 | ||
| 1128 | __kvm_skip_instr(vcpu); | ||
| 1129 | |||
| 1126 | return 1; | 1130 | return 1; |
| 1127 | } | 1131 | } |
| 1128 | 1132 | ||
diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c index dac7ceb1a677..08443a15e6be 100644 --- a/virt/kvm/arm/mmio.c +++ b/virt/kvm/arm/mmio.c | |||
| @@ -117,6 +117,12 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) | |||
| 117 | vcpu_set_reg(vcpu, vcpu->arch.mmio_decode.rt, data); | 117 | vcpu_set_reg(vcpu, vcpu->arch.mmio_decode.rt, data); |
| 118 | } | 118 | } |
| 119 | 119 | ||
| 120 | /* | ||
| 121 | * The MMIO instruction is emulated and should not be re-executed | ||
| 122 | * in the guest. | ||
| 123 | */ | ||
| 124 | kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); | ||
| 125 | |||
| 120 | return 0; | 126 | return 0; |
| 121 | } | 127 | } |
| 122 | 128 | ||
| @@ -144,11 +150,6 @@ static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len) | |||
| 144 | vcpu->arch.mmio_decode.sign_extend = sign_extend; | 150 | vcpu->arch.mmio_decode.sign_extend = sign_extend; |
| 145 | vcpu->arch.mmio_decode.rt = rt; | 151 | vcpu->arch.mmio_decode.rt = rt; |
| 146 | 152 | ||
| 147 | /* | ||
| 148 | * The MMIO instruction is emulated and should not be re-executed | ||
| 149 | * in the guest. | ||
| 150 | */ | ||
| 151 | kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); | ||
| 152 | return 0; | 153 | return 0; |
| 153 | } | 154 | } |
| 154 | 155 | ||
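Moving kvm_skip_instr() from decode_hsr() to kvm_handle_mmio_return() means the guest PC only advances once the access has actually been completed, possibly after a round trip to userspace. The toy userspace model below is not kernel code and assumes a fixed 4-byte instruction width; it only illustrates the "emulate, then advance" ordering the change enforces.

#include <stdio.h>

struct toy_vcpu {
        unsigned long pc;
        int mmio_pending;               /* decoded, waiting for the data */
};

/* Decode only: remember that an access must be emulated, leave the PC alone. */
static void decode_mmio(struct toy_vcpu *v)
{
        v->mmio_pending = 1;
}

/* Completion: the data is available, the access is architecturally done,
 * and only now is it safe to step over the faulting instruction. */
static void complete_mmio(struct toy_vcpu *v)
{
        v->mmio_pending = 0;
        v->pc += 4;                     /* assume a fixed 4-byte instruction */
}

int main(void)
{
        struct toy_vcpu v = { .pc = 0x1000, .mmio_pending = 0 };

        decode_mmio(&v);
        printf("after decode:     pc=%#lx pending=%d\n", v.pc, v.mmio_pending);
        complete_mmio(&v);
        printf("after completion: pc=%#lx pending=%d\n", v.pc, v.mmio_pending);
        return 0;
}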
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 5eca48bdb1a6..3053bf2584f8 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c | |||
| @@ -115,6 +115,25 @@ static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd) | |||
| 115 | put_page(virt_to_page(pmd)); | 115 | put_page(virt_to_page(pmd)); |
| 116 | } | 116 | } |
| 117 | 117 | ||
| 118 | /** | ||
| 119 | * stage2_dissolve_pud() - clear and flush huge PUD entry | ||
| 120 | * @kvm: pointer to kvm structure. | ||
| 121 | * @addr: IPA | ||
| 122 | * @pud: pud pointer for IPA | ||
| 123 | * | ||
| 124 | * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. Marks all | ||
| 125 | * pages in the range dirty. | ||
| 126 | */ | ||
| 127 | static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp) | ||
| 128 | { | ||
| 129 | if (!stage2_pud_huge(kvm, *pudp)) | ||
| 130 | return; | ||
| 131 | |||
| 132 | stage2_pud_clear(kvm, pudp); | ||
| 133 | kvm_tlb_flush_vmid_ipa(kvm, addr); | ||
| 134 | put_page(virt_to_page(pudp)); | ||
| 135 | } | ||
| 136 | |||
| 118 | static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, | 137 | static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, |
| 119 | int min, int max) | 138 | int min, int max) |
| 120 | { | 139 | { |
| @@ -607,7 +626,7 @@ static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start, | |||
| 607 | addr = start; | 626 | addr = start; |
| 608 | do { | 627 | do { |
| 609 | pte = pte_offset_kernel(pmd, addr); | 628 | pte = pte_offset_kernel(pmd, addr); |
| 610 | kvm_set_pte(pte, pfn_pte(pfn, prot)); | 629 | kvm_set_pte(pte, kvm_pfn_pte(pfn, prot)); |
| 611 | get_page(virt_to_page(pte)); | 630 | get_page(virt_to_page(pte)); |
| 612 | pfn++; | 631 | pfn++; |
| 613 | } while (addr += PAGE_SIZE, addr != end); | 632 | } while (addr += PAGE_SIZE, addr != end); |
| @@ -1022,7 +1041,7 @@ static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache | |||
| 1022 | pmd_t *pmd; | 1041 | pmd_t *pmd; |
| 1023 | 1042 | ||
| 1024 | pud = stage2_get_pud(kvm, cache, addr); | 1043 | pud = stage2_get_pud(kvm, cache, addr); |
| 1025 | if (!pud) | 1044 | if (!pud || stage2_pud_huge(kvm, *pud)) |
| 1026 | return NULL; | 1045 | return NULL; |
| 1027 | 1046 | ||
| 1028 | if (stage2_pud_none(kvm, *pud)) { | 1047 | if (stage2_pud_none(kvm, *pud)) { |
| @@ -1083,29 +1102,103 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache | |||
| 1083 | return 0; | 1102 | return 0; |
| 1084 | } | 1103 | } |
| 1085 | 1104 | ||
| 1086 | static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr) | 1105 | static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, |
| 1106 | phys_addr_t addr, const pud_t *new_pudp) | ||
| 1107 | { | ||
| 1108 | pud_t *pudp, old_pud; | ||
| 1109 | |||
| 1110 | pudp = stage2_get_pud(kvm, cache, addr); | ||
| 1111 | VM_BUG_ON(!pudp); | ||
| 1112 | |||
| 1113 | old_pud = *pudp; | ||
| 1114 | |||
| 1115 | /* | ||
| 1116 | * A large number of vcpus faulting on the same stage 2 entry, | ||
| 1117 | * can lead to a refault due to the | ||
| 1118 | * stage2_pud_clear()/tlb_flush(). Skip updating the page | ||
| 1119 | * tables if there is no change. | ||
| 1120 | */ | ||
| 1121 | if (pud_val(old_pud) == pud_val(*new_pudp)) | ||
| 1122 | return 0; | ||
| 1123 | |||
| 1124 | if (stage2_pud_present(kvm, old_pud)) { | ||
| 1125 | stage2_pud_clear(kvm, pudp); | ||
| 1126 | kvm_tlb_flush_vmid_ipa(kvm, addr); | ||
| 1127 | } else { | ||
| 1128 | get_page(virt_to_page(pudp)); | ||
| 1129 | } | ||
| 1130 | |||
| 1131 | kvm_set_pud(pudp, *new_pudp); | ||
| 1132 | return 0; | ||
| 1133 | } | ||
| 1134 | |||
| 1135 | /* | ||
| 1136 | * stage2_get_leaf_entry - walk the stage2 VM page tables and return | ||
| 1137 | * true if a valid and present leaf-entry is found. A pointer to the | ||
| 1138 | * leaf-entry is returned in the appropriate level variable - pudpp, | ||
| 1139 | * pmdpp, ptepp. | ||
| 1140 | */ | ||
| 1141 | static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr, | ||
| 1142 | pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp) | ||
| 1087 | { | 1143 | { |
| 1144 | pud_t *pudp; | ||
| 1088 | pmd_t *pmdp; | 1145 | pmd_t *pmdp; |
| 1089 | pte_t *ptep; | 1146 | pte_t *ptep; |
| 1090 | 1147 | ||
| 1091 | pmdp = stage2_get_pmd(kvm, NULL, addr); | 1148 | *pudpp = NULL; |
| 1149 | *pmdpp = NULL; | ||
| 1150 | *ptepp = NULL; | ||
| 1151 | |||
| 1152 | pudp = stage2_get_pud(kvm, NULL, addr); | ||
| 1153 | if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp)) | ||
| 1154 | return false; | ||
| 1155 | |||
| 1156 | if (stage2_pud_huge(kvm, *pudp)) { | ||
| 1157 | *pudpp = pudp; | ||
| 1158 | return true; | ||
| 1159 | } | ||
| 1160 | |||
| 1161 | pmdp = stage2_pmd_offset(kvm, pudp, addr); | ||
| 1092 | if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp)) | 1162 | if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp)) |
| 1093 | return false; | 1163 | return false; |
| 1094 | 1164 | ||
| 1095 | if (pmd_thp_or_huge(*pmdp)) | 1165 | if (pmd_thp_or_huge(*pmdp)) { |
| 1096 | return kvm_s2pmd_exec(pmdp); | 1166 | *pmdpp = pmdp; |
| 1167 | return true; | ||
| 1168 | } | ||
| 1097 | 1169 | ||
| 1098 | ptep = pte_offset_kernel(pmdp, addr); | 1170 | ptep = pte_offset_kernel(pmdp, addr); |
| 1099 | if (!ptep || pte_none(*ptep) || !pte_present(*ptep)) | 1171 | if (!ptep || pte_none(*ptep) || !pte_present(*ptep)) |
| 1100 | return false; | 1172 | return false; |
| 1101 | 1173 | ||
| 1102 | return kvm_s2pte_exec(ptep); | 1174 | *ptepp = ptep; |
| 1175 | return true; | ||
| 1176 | } | ||
| 1177 | |||
| 1178 | static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr) | ||
| 1179 | { | ||
| 1180 | pud_t *pudp; | ||
| 1181 | pmd_t *pmdp; | ||
| 1182 | pte_t *ptep; | ||
| 1183 | bool found; | ||
| 1184 | |||
| 1185 | found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep); | ||
| 1186 | if (!found) | ||
| 1187 | return false; | ||
| 1188 | |||
| 1189 | if (pudp) | ||
| 1190 | return kvm_s2pud_exec(pudp); | ||
| 1191 | else if (pmdp) | ||
| 1192 | return kvm_s2pmd_exec(pmdp); | ||
| 1193 | else | ||
| 1194 | return kvm_s2pte_exec(ptep); | ||
| 1103 | } | 1195 | } |
| 1104 | 1196 | ||
| 1105 | static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, | 1197 | static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, |
| 1106 | phys_addr_t addr, const pte_t *new_pte, | 1198 | phys_addr_t addr, const pte_t *new_pte, |
| 1107 | unsigned long flags) | 1199 | unsigned long flags) |
| 1108 | { | 1200 | { |
| 1201 | pud_t *pud; | ||
| 1109 | pmd_t *pmd; | 1202 | pmd_t *pmd; |
| 1110 | pte_t *pte, old_pte; | 1203 | pte_t *pte, old_pte; |
| 1111 | bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP; | 1204 | bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP; |
| @@ -1114,7 +1207,31 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, | |||
| 1114 | VM_BUG_ON(logging_active && !cache); | 1207 | VM_BUG_ON(logging_active && !cache); |
| 1115 | 1208 | ||
| 1116 | /* Create stage-2 page table mapping - Levels 0 and 1 */ | 1209 | /* Create stage-2 page table mapping - Levels 0 and 1 */ |
| 1117 | pmd = stage2_get_pmd(kvm, cache, addr); | 1210 | pud = stage2_get_pud(kvm, cache, addr); |
| 1211 | if (!pud) { | ||
| 1212 | /* | ||
| 1213 | * Ignore calls from kvm_set_spte_hva for unallocated | ||
| 1214 | * address ranges. | ||
| 1215 | */ | ||
| 1216 | return 0; | ||
| 1217 | } | ||
| 1218 | |||
| 1219 | /* | ||
| 1220 | * While dirty page logging - dissolve huge PUD, then continue | ||
| 1221 | * on to allocate page. | ||
| 1222 | */ | ||
| 1223 | if (logging_active) | ||
| 1224 | stage2_dissolve_pud(kvm, addr, pud); | ||
| 1225 | |||
| 1226 | if (stage2_pud_none(kvm, *pud)) { | ||
| 1227 | if (!cache) | ||
| 1228 | return 0; /* ignore calls from kvm_set_spte_hva */ | ||
| 1229 | pmd = mmu_memory_cache_alloc(cache); | ||
| 1230 | stage2_pud_populate(kvm, pud, pmd); | ||
| 1231 | get_page(virt_to_page(pud)); | ||
| 1232 | } | ||
| 1233 | |||
| 1234 | pmd = stage2_pmd_offset(kvm, pud, addr); | ||
| 1118 | if (!pmd) { | 1235 | if (!pmd) { |
| 1119 | /* | 1236 | /* |
| 1120 | * Ignore calls from kvm_set_spte_hva for unallocated | 1237 | * Ignore calls from kvm_set_spte_hva for unallocated |
| @@ -1182,6 +1299,11 @@ static int stage2_pmdp_test_and_clear_young(pmd_t *pmd) | |||
| 1182 | return stage2_ptep_test_and_clear_young((pte_t *)pmd); | 1299 | return stage2_ptep_test_and_clear_young((pte_t *)pmd); |
| 1183 | } | 1300 | } |
| 1184 | 1301 | ||
| 1302 | static int stage2_pudp_test_and_clear_young(pud_t *pud) | ||
| 1303 | { | ||
| 1304 | return stage2_ptep_test_and_clear_young((pte_t *)pud); | ||
| 1305 | } | ||
| 1306 | |||
| 1185 | /** | 1307 | /** |
| 1186 | * kvm_phys_addr_ioremap - map a device range to guest IPA | 1308 | * kvm_phys_addr_ioremap - map a device range to guest IPA |
| 1187 | * | 1309 | * |
| @@ -1202,7 +1324,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, | |||
| 1202 | pfn = __phys_to_pfn(pa); | 1324 | pfn = __phys_to_pfn(pa); |
| 1203 | 1325 | ||
| 1204 | for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) { | 1326 | for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) { |
| 1205 | pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE); | 1327 | pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE); |
| 1206 | 1328 | ||
| 1207 | if (writable) | 1329 | if (writable) |
| 1208 | pte = kvm_s2pte_mkwrite(pte); | 1330 | pte = kvm_s2pte_mkwrite(pte); |
| @@ -1234,7 +1356,7 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap) | |||
| 1234 | struct page *page = pfn_to_page(pfn); | 1356 | struct page *page = pfn_to_page(pfn); |
| 1235 | 1357 | ||
| 1236 | /* | 1358 | /* |
| 1237 | * PageTransCompoungMap() returns true for THP and | 1359 | * PageTransCompoundMap() returns true for THP and |
| 1238 | * hugetlbfs. Make sure the adjustment is done only for THP | 1360 | * hugetlbfs. Make sure the adjustment is done only for THP |
| 1239 | * pages. | 1361 | * pages. |
| 1240 | */ | 1362 | */ |
| @@ -1347,9 +1469,12 @@ static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd, | |||
| 1347 | do { | 1469 | do { |
| 1348 | next = stage2_pud_addr_end(kvm, addr, end); | 1470 | next = stage2_pud_addr_end(kvm, addr, end); |
| 1349 | if (!stage2_pud_none(kvm, *pud)) { | 1471 | if (!stage2_pud_none(kvm, *pud)) { |
| 1350 | /* TODO:PUD not supported, revisit later if supported */ | 1472 | if (stage2_pud_huge(kvm, *pud)) { |
| 1351 | BUG_ON(stage2_pud_huge(kvm, *pud)); | 1473 | if (!kvm_s2pud_readonly(pud)) |
| 1352 | stage2_wp_pmds(kvm, pud, addr, next); | 1474 | kvm_set_s2pud_readonly(pud); |
| 1475 | } else { | ||
| 1476 | stage2_wp_pmds(kvm, pud, addr, next); | ||
| 1477 | } | ||
| 1353 | } | 1478 | } |
| 1354 | } while (pud++, addr = next, addr != end); | 1479 | } while (pud++, addr = next, addr != end); |
| 1355 | } | 1480 | } |
| @@ -1392,7 +1517,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) | |||
| 1392 | * | 1517 | * |
| 1393 | * Called to start logging dirty pages after memory region | 1518 | * Called to start logging dirty pages after memory region |
| 1394 | * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns | 1519 | * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns |
| 1395 | * all present PMD and PTEs are write protected in the memory region. | 1520 | * all present PUD, PMD and PTEs are write protected in the memory region. |
| 1396 | * Afterwards read of dirty page log can be called. | 1521 | * Afterwards read of dirty page log can be called. |
| 1397 | * | 1522 | * |
| 1398 | * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, | 1523 | * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, |
| @@ -1470,12 +1595,70 @@ static void kvm_send_hwpoison_signal(unsigned long address, | |||
| 1470 | send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); | 1595 | send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); |
| 1471 | } | 1596 | } |
| 1472 | 1597 | ||
| 1598 | static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot, | ||
| 1599 | unsigned long hva) | ||
| 1600 | { | ||
| 1601 | gpa_t gpa_start, gpa_end; | ||
| 1602 | hva_t uaddr_start, uaddr_end; | ||
| 1603 | size_t size; | ||
| 1604 | |||
| 1605 | size = memslot->npages * PAGE_SIZE; | ||
| 1606 | |||
| 1607 | gpa_start = memslot->base_gfn << PAGE_SHIFT; | ||
| 1608 | gpa_end = gpa_start + size; | ||
| 1609 | |||
| 1610 | uaddr_start = memslot->userspace_addr; | ||
| 1611 | uaddr_end = uaddr_start + size; | ||
| 1612 | |||
| 1613 | /* | ||
| 1614 | * Pages belonging to memslots that don't have the same alignment | ||
| 1615 | * within a PMD for userspace and IPA cannot be mapped with stage-2 | ||
| 1616 | * PMD entries, because we'll end up mapping the wrong pages. | ||
| 1617 | * | ||
| 1618 | * Consider a layout like the following: | ||
| 1619 | * | ||
| 1620 | * memslot->userspace_addr: | ||
| 1621 | * +-----+--------------------+--------------------+---+ | ||
| 1622 | * |abcde|fgh Stage-1 PMD | Stage-1 PMD tv|xyz| | ||
| 1623 | * +-----+--------------------+--------------------+---+ | ||
| 1624 | * | ||
| 1625 | * memslot->base_gfn << PAGE_SIZE: | ||
| 1626 | * +---+--------------------+--------------------+-----+ | ||
| 1627 | * |abc|def Stage-2 PMD | Stage-2 PMD |tvxyz| | ||
| 1628 | * +---+--------------------+--------------------+-----+ | ||
| 1629 | * | ||
| 1630 | * If we create those stage-2 PMDs, we'll end up with this incorrect | ||
| 1631 | * mapping: | ||
| 1632 | * d -> f | ||
| 1633 | * e -> g | ||
| 1634 | * f -> h | ||
| 1635 | */ | ||
| 1636 | if ((gpa_start & ~S2_PMD_MASK) != (uaddr_start & ~S2_PMD_MASK)) | ||
| 1637 | return false; | ||
| 1638 | |||
| 1639 | /* | ||
| 1640 | * Next, let's make sure we're not trying to map anything not covered | ||
| 1641 | * by the memslot. This means we have to prohibit PMD size mappings | ||
| 1642 | * for the beginning and end of a non-PMD aligned and non-PMD sized | ||
| 1643 | * memory slot (illustrated by the head and tail parts of the | ||
| 1644 | * userspace view above containing pages 'abcde' and 'xyz', | ||
| 1645 | * respectively). | ||
| 1646 | * | ||
| 1647 | * Note that it doesn't matter if we do the check using the | ||
| 1648 | * userspace_addr or the base_gfn, as both are equally aligned (per | ||
| 1649 | * the check above) and equally sized. | ||
| 1650 | */ | ||
| 1651 | return (hva & S2_PMD_MASK) >= uaddr_start && | ||
| 1652 | (hva & S2_PMD_MASK) + S2_PMD_SIZE <= uaddr_end; | ||
| 1653 | } | ||
| 1654 | |||
| 1473 | static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, | 1655 | static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, |
| 1474 | struct kvm_memory_slot *memslot, unsigned long hva, | 1656 | struct kvm_memory_slot *memslot, unsigned long hva, |
| 1475 | unsigned long fault_status) | 1657 | unsigned long fault_status) |
| 1476 | { | 1658 | { |
| 1477 | int ret; | 1659 | int ret; |
| 1478 | bool write_fault, exec_fault, writable, hugetlb = false, force_pte = false; | 1660 | bool write_fault, writable, force_pte = false; |
| 1661 | bool exec_fault, needs_exec; | ||
| 1479 | unsigned long mmu_seq; | 1662 | unsigned long mmu_seq; |
| 1480 | gfn_t gfn = fault_ipa >> PAGE_SHIFT; | 1663 | gfn_t gfn = fault_ipa >> PAGE_SHIFT; |
| 1481 | struct kvm *kvm = vcpu->kvm; | 1664 | struct kvm *kvm = vcpu->kvm; |
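The heart of fault_supports_stage2_pmd_mappings() is comparing the IPA and userspace offsets within a PMD-sized block. The standalone program below (userspace, not kernel code) runs the same test with concrete numbers, assuming 4K pages and a 2MiB stage-2 PMD; the addresses are arbitrary examples.

#include <stdint.h>
#include <stdio.h>

#define PMD_SIZE  (2UL * 1024 * 1024)   /* assumed S2_PMD_SIZE */
#define PMD_MASK  (~(PMD_SIZE - 1))

static int supports_pmd_mappings(uint64_t gpa_start, uint64_t uaddr_start)
{
        /* Same test as the kernel: offsets within a PMD block must match. */
        return (gpa_start & ~PMD_MASK) == (uaddr_start & ~PMD_MASK);
}

int main(void)
{
        /* Both sides 2MiB-aligned: block mappings are possible. */
        printf("%d\n", supports_pmd_mappings(0x80000000ULL, 0x7f5a00000000ULL));
        /* IPA offset 0, userspace offset 1MiB: must fall back to PTEs. */
        printf("%d\n", supports_pmd_mappings(0x80000000ULL, 0x7f5a00100000ULL));
        return 0;
}

When the offsets differ, a single stage-2 block would span pages that do not correspond to the stage-1 mapping, which is why the fault handler forces PTE mappings in that case.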
| @@ -1484,7 +1667,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, | |||
| 1484 | kvm_pfn_t pfn; | 1667 | kvm_pfn_t pfn; |
| 1485 | pgprot_t mem_type = PAGE_S2; | 1668 | pgprot_t mem_type = PAGE_S2; |
| 1486 | bool logging_active = memslot_is_logging(memslot); | 1669 | bool logging_active = memslot_is_logging(memslot); |
| 1487 | unsigned long flags = 0; | 1670 | unsigned long vma_pagesize, flags = 0; |
| 1488 | 1671 | ||
| 1489 | write_fault = kvm_is_write_fault(vcpu); | 1672 | write_fault = kvm_is_write_fault(vcpu); |
| 1490 | exec_fault = kvm_vcpu_trap_is_iabt(vcpu); | 1673 | exec_fault = kvm_vcpu_trap_is_iabt(vcpu); |
| @@ -1495,6 +1678,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, | |||
| 1495 | return -EFAULT; | 1678 | return -EFAULT; |
| 1496 | } | 1679 | } |
| 1497 | 1680 | ||
| 1681 | if (!fault_supports_stage2_pmd_mappings(memslot, hva)) | ||
| 1682 | force_pte = true; | ||
| 1683 | |||
| 1684 | if (logging_active) | ||
| 1685 | force_pte = true; | ||
| 1686 | |||
| 1498 | /* Let's check if we will get back a huge page backed by hugetlbfs */ | 1687 | /* Let's check if we will get back a huge page backed by hugetlbfs */ |
| 1499 | down_read(¤t->mm->mmap_sem); | 1688 | down_read(¤t->mm->mmap_sem); |
| 1500 | vma = find_vma_intersection(current->mm, hva, hva + 1); | 1689 | vma = find_vma_intersection(current->mm, hva, hva + 1); |
| @@ -1504,22 +1693,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, | |||
| 1504 | return -EFAULT; | 1693 | return -EFAULT; |
| 1505 | } | 1694 | } |
| 1506 | 1695 | ||
| 1507 | if (vma_kernel_pagesize(vma) == PMD_SIZE && !logging_active) { | 1696 | vma_pagesize = vma_kernel_pagesize(vma); |
| 1508 | hugetlb = true; | 1697 | /* |
| 1509 | gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; | 1698 | * PUD level may not exist for a VM but PMD is guaranteed to |
| 1510 | } else { | 1699 | * exist. |
| 1511 | /* | 1700 | */ |
| 1512 | * Pages belonging to memslots that don't have the same | 1701 | if ((vma_pagesize == PMD_SIZE || |
| 1513 | * alignment for userspace and IPA cannot be mapped using | 1702 | (vma_pagesize == PUD_SIZE && kvm_stage2_has_pud(kvm))) && |
| 1514 | * block descriptors even if the pages belong to a THP for | 1703 | !force_pte) { |
| 1515 | * the process, because the stage-2 block descriptor will | 1704 | gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT; |
| 1516 | * cover more than a single THP and we loose atomicity for | ||
| 1517 | * unmapping, updates, and splits of the THP or other pages | ||
| 1518 | * in the stage-2 block range. | ||
| 1519 | */ | ||
| 1520 | if ((memslot->userspace_addr & ~PMD_MASK) != | ||
| 1521 | ((memslot->base_gfn << PAGE_SHIFT) & ~PMD_MASK)) | ||
| 1522 | force_pte = true; | ||
| 1523 | } | 1705 | } |
| 1524 | up_read(¤t->mm->mmap_sem); | 1706 | up_read(¤t->mm->mmap_sem); |
| 1525 | 1707 | ||
| @@ -1558,7 +1740,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, | |||
| 1558 | * should not be mapped with huge pages (it introduces churn | 1740 | * should not be mapped with huge pages (it introduces churn |
| 1559 | * and performance degradation), so force a pte mapping. | 1741 | * and performance degradation), so force a pte mapping. |
| 1560 | */ | 1742 | */ |
| 1561 | force_pte = true; | ||
| 1562 | flags |= KVM_S2_FLAG_LOGGING_ACTIVE; | 1743 | flags |= KVM_S2_FLAG_LOGGING_ACTIVE; |
| 1563 | 1744 | ||
| 1564 | /* | 1745 | /* |
| @@ -1573,50 +1754,69 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, | |||
| 1573 | if (mmu_notifier_retry(kvm, mmu_seq)) | 1754 | if (mmu_notifier_retry(kvm, mmu_seq)) |
| 1574 | goto out_unlock; | 1755 | goto out_unlock; |
| 1575 | 1756 | ||
| 1576 | if (!hugetlb && !force_pte) | 1757 | if (vma_pagesize == PAGE_SIZE && !force_pte) { |
| 1577 | hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa); | 1758 | /* |
| 1759 | * Only PMD_SIZE transparent hugepages(THP) are | ||
| 1760 | * currently supported. This code will need to be | ||
| 1761 | * updated to support other THP sizes. | ||
| 1762 | */ | ||
| 1763 | if (transparent_hugepage_adjust(&pfn, &fault_ipa)) | ||
| 1764 | vma_pagesize = PMD_SIZE; | ||
| 1765 | } | ||
| 1578 | 1766 | ||
| 1579 | if (hugetlb) { | 1767 | if (writable) |
| 1580 | pmd_t new_pmd = pfn_pmd(pfn, mem_type); | 1768 | kvm_set_pfn_dirty(pfn); |
| 1581 | new_pmd = pmd_mkhuge(new_pmd); | ||
| 1582 | if (writable) { | ||
| 1583 | new_pmd = kvm_s2pmd_mkwrite(new_pmd); | ||
| 1584 | kvm_set_pfn_dirty(pfn); | ||
| 1585 | } | ||
| 1586 | 1769 | ||
| 1587 | if (fault_status != FSC_PERM) | 1770 | if (fault_status != FSC_PERM) |
| 1588 | clean_dcache_guest_page(pfn, PMD_SIZE); | 1771 | clean_dcache_guest_page(pfn, vma_pagesize); |
| 1589 | 1772 | ||
| 1590 | if (exec_fault) { | 1773 | if (exec_fault) |
| 1774 | invalidate_icache_guest_page(pfn, vma_pagesize); | ||
| 1775 | |||
| 1776 | /* | ||
| 1777 | * If we took an execution fault we have made the | ||
| 1778 | * icache/dcache coherent above and should now let the s2 | ||
| 1779 | * mapping be executable. | ||
| 1780 | * | ||
| 1781 | * Write faults (!exec_fault && FSC_PERM) are orthogonal to | ||
| 1782 | * execute permissions, and we preserve whatever we have. | ||
| 1783 | */ | ||
| 1784 | needs_exec = exec_fault || | ||
| 1785 | (fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa)); | ||
| 1786 | |||
| 1787 | if (vma_pagesize == PUD_SIZE) { | ||
| 1788 | pud_t new_pud = kvm_pfn_pud(pfn, mem_type); | ||
| 1789 | |||
| 1790 | new_pud = kvm_pud_mkhuge(new_pud); | ||
| 1791 | if (writable) | ||
| 1792 | new_pud = kvm_s2pud_mkwrite(new_pud); | ||
| 1793 | |||
| 1794 | if (needs_exec) | ||
| 1795 | new_pud = kvm_s2pud_mkexec(new_pud); | ||
| 1796 | |||
| 1797 | ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud); | ||
| 1798 | } else if (vma_pagesize == PMD_SIZE) { | ||
| 1799 | pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type); | ||
| 1800 | |||
| 1801 | new_pmd = kvm_pmd_mkhuge(new_pmd); | ||
| 1802 | |||
| 1803 | if (writable) | ||
| 1804 | new_pmd = kvm_s2pmd_mkwrite(new_pmd); | ||
| 1805 | |||
| 1806 | if (needs_exec) | ||
| 1591 | new_pmd = kvm_s2pmd_mkexec(new_pmd); | 1807 | new_pmd = kvm_s2pmd_mkexec(new_pmd); |
| 1592 | invalidate_icache_guest_page(pfn, PMD_SIZE); | ||
| 1593 | } else if (fault_status == FSC_PERM) { | ||
| 1594 | /* Preserve execute if XN was already cleared */ | ||
| 1595 | if (stage2_is_exec(kvm, fault_ipa)) | ||
| 1596 | new_pmd = kvm_s2pmd_mkexec(new_pmd); | ||
| 1597 | } | ||
| 1598 | 1808 | ||
| 1599 | ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); | 1809 | ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); |
| 1600 | } else { | 1810 | } else { |
| 1601 | pte_t new_pte = pfn_pte(pfn, mem_type); | 1811 | pte_t new_pte = kvm_pfn_pte(pfn, mem_type); |
| 1602 | 1812 | ||
| 1603 | if (writable) { | 1813 | if (writable) { |
| 1604 | new_pte = kvm_s2pte_mkwrite(new_pte); | 1814 | new_pte = kvm_s2pte_mkwrite(new_pte); |
| 1605 | kvm_set_pfn_dirty(pfn); | ||
| 1606 | mark_page_dirty(kvm, gfn); | 1815 | mark_page_dirty(kvm, gfn); |
| 1607 | } | 1816 | } |
| 1608 | 1817 | ||
| 1609 | if (fault_status != FSC_PERM) | 1818 | if (needs_exec) |
| 1610 | clean_dcache_guest_page(pfn, PAGE_SIZE); | ||
| 1611 | |||
| 1612 | if (exec_fault) { | ||
| 1613 | new_pte = kvm_s2pte_mkexec(new_pte); | 1819 | new_pte = kvm_s2pte_mkexec(new_pte); |
| 1614 | invalidate_icache_guest_page(pfn, PAGE_SIZE); | ||
| 1615 | } else if (fault_status == FSC_PERM) { | ||
| 1616 | /* Preserve execute if XN was already cleared */ | ||
| 1617 | if (stage2_is_exec(kvm, fault_ipa)) | ||
| 1618 | new_pte = kvm_s2pte_mkexec(new_pte); | ||
| 1619 | } | ||
| 1620 | 1820 | ||
| 1621 | ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags); | 1821 | ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags); |
| 1622 | } | 1822 | } |
| @@ -1637,6 +1837,7 @@ out_unlock: | |||
| 1637 | */ | 1837 | */ |
| 1638 | static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) | 1838 | static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) |
| 1639 | { | 1839 | { |
| 1840 | pud_t *pud; | ||
| 1640 | pmd_t *pmd; | 1841 | pmd_t *pmd; |
| 1641 | pte_t *pte; | 1842 | pte_t *pte; |
| 1642 | kvm_pfn_t pfn; | 1843 | kvm_pfn_t pfn; |
| @@ -1646,24 +1847,23 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) | |||
| 1646 | 1847 | ||
| 1647 | spin_lock(&vcpu->kvm->mmu_lock); | 1848 | spin_lock(&vcpu->kvm->mmu_lock); |
| 1648 | 1849 | ||
| 1649 | pmd = stage2_get_pmd(vcpu->kvm, NULL, fault_ipa); | 1850 | if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte)) |
| 1650 | if (!pmd || pmd_none(*pmd)) /* Nothing there */ | ||
| 1651 | goto out; | 1851 | goto out; |
| 1652 | 1852 | ||
| 1653 | if (pmd_thp_or_huge(*pmd)) { /* THP, HugeTLB */ | 1853 | if (pud) { /* HugeTLB */ |
| 1854 | *pud = kvm_s2pud_mkyoung(*pud); | ||
| 1855 | pfn = kvm_pud_pfn(*pud); | ||
| 1856 | pfn_valid = true; | ||
| 1857 | } else if (pmd) { /* THP, HugeTLB */ | ||
| 1654 | *pmd = pmd_mkyoung(*pmd); | 1858 | *pmd = pmd_mkyoung(*pmd); |
| 1655 | pfn = pmd_pfn(*pmd); | 1859 | pfn = pmd_pfn(*pmd); |
| 1656 | pfn_valid = true; | 1860 | pfn_valid = true; |
| 1657 | goto out; | 1861 | } else { |
| 1862 | *pte = pte_mkyoung(*pte); /* Just a page... */ | ||
| 1863 | pfn = pte_pfn(*pte); | ||
| 1864 | pfn_valid = true; | ||
| 1658 | } | 1865 | } |
| 1659 | 1866 | ||
| 1660 | pte = pte_offset_kernel(pmd, fault_ipa); | ||
| 1661 | if (pte_none(*pte)) /* Nothing there either */ | ||
| 1662 | goto out; | ||
| 1663 | |||
| 1664 | *pte = pte_mkyoung(*pte); /* Just a page... */ | ||
| 1665 | pfn = pte_pfn(*pte); | ||
| 1666 | pfn_valid = true; | ||
| 1667 | out: | 1867 | out: |
| 1668 | spin_unlock(&vcpu->kvm->mmu_lock); | 1868 | spin_unlock(&vcpu->kvm->mmu_lock); |
| 1669 | if (pfn_valid) | 1869 | if (pfn_valid) |
| @@ -1849,14 +2049,14 @@ static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data | |||
| 1849 | } | 2049 | } |
| 1850 | 2050 | ||
| 1851 | 2051 | ||
| 1852 | void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) | 2052 | int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) |
| 1853 | { | 2053 | { |
| 1854 | unsigned long end = hva + PAGE_SIZE; | 2054 | unsigned long end = hva + PAGE_SIZE; |
| 1855 | kvm_pfn_t pfn = pte_pfn(pte); | 2055 | kvm_pfn_t pfn = pte_pfn(pte); |
| 1856 | pte_t stage2_pte; | 2056 | pte_t stage2_pte; |
| 1857 | 2057 | ||
| 1858 | if (!kvm->arch.pgd) | 2058 | if (!kvm->arch.pgd) |
| 1859 | return; | 2059 | return 0; |
| 1860 | 2060 | ||
| 1861 | trace_kvm_set_spte_hva(hva); | 2061 | trace_kvm_set_spte_hva(hva); |
| 1862 | 2062 | ||
| @@ -1865,48 +2065,46 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) | |||
| 1865 | * just like a translation fault and clean the cache to the PoC. | 2065 | * just like a translation fault and clean the cache to the PoC. |
| 1866 | */ | 2066 | */ |
| 1867 | clean_dcache_guest_page(pfn, PAGE_SIZE); | 2067 | clean_dcache_guest_page(pfn, PAGE_SIZE); |
| 1868 | stage2_pte = pfn_pte(pfn, PAGE_S2); | 2068 | stage2_pte = kvm_pfn_pte(pfn, PAGE_S2); |
| 1869 | handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); | 2069 | handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); |
| 2070 | |||
| 2071 | return 0; | ||
| 1870 | } | 2072 | } |
| 1871 | 2073 | ||
| 1872 | static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) | 2074 | static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) |
| 1873 | { | 2075 | { |
| 2076 | pud_t *pud; | ||
| 1874 | pmd_t *pmd; | 2077 | pmd_t *pmd; |
| 1875 | pte_t *pte; | 2078 | pte_t *pte; |
| 1876 | 2079 | ||
| 1877 | WARN_ON(size != PAGE_SIZE && size != PMD_SIZE); | 2080 | WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); |
| 1878 | pmd = stage2_get_pmd(kvm, NULL, gpa); | 2081 | if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte)) |
| 1879 | if (!pmd || pmd_none(*pmd)) /* Nothing there */ | ||
| 1880 | return 0; | 2082 | return 0; |
| 1881 | 2083 | ||
| 1882 | if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */ | 2084 | if (pud) |
| 2085 | return stage2_pudp_test_and_clear_young(pud); | ||
| 2086 | else if (pmd) | ||
| 1883 | return stage2_pmdp_test_and_clear_young(pmd); | 2087 | return stage2_pmdp_test_and_clear_young(pmd); |
| 1884 | 2088 | else | |
| 1885 | pte = pte_offset_kernel(pmd, gpa); | 2089 | return stage2_ptep_test_and_clear_young(pte); |
| 1886 | if (pte_none(*pte)) | ||
| 1887 | return 0; | ||
| 1888 | |||
| 1889 | return stage2_ptep_test_and_clear_young(pte); | ||
| 1890 | } | 2090 | } |
| 1891 | 2091 | ||
| 1892 | static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) | 2092 | static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) |
| 1893 | { | 2093 | { |
| 2094 | pud_t *pud; | ||
| 1894 | pmd_t *pmd; | 2095 | pmd_t *pmd; |
| 1895 | pte_t *pte; | 2096 | pte_t *pte; |
| 1896 | 2097 | ||
| 1897 | WARN_ON(size != PAGE_SIZE && size != PMD_SIZE); | 2098 | WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); |
| 1898 | pmd = stage2_get_pmd(kvm, NULL, gpa); | 2099 | if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte)) |
| 1899 | if (!pmd || pmd_none(*pmd)) /* Nothing there */ | ||
| 1900 | return 0; | 2100 | return 0; |
| 1901 | 2101 | ||
| 1902 | if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */ | 2102 | if (pud) |
| 2103 | return kvm_s2pud_young(*pud); | ||
| 2104 | else if (pmd) | ||
| 1903 | return pmd_young(*pmd); | 2105 | return pmd_young(*pmd); |
| 1904 | 2106 | else | |
| 1905 | pte = pte_offset_kernel(pmd, gpa); | ||
| 1906 | if (!pte_none(*pte)) /* Just a page... */ | ||
| 1907 | return pte_young(*pte); | 2107 | return pte_young(*pte); |
| 1908 | |||
| 1909 | return 0; | ||
| 1910 | } | 2108 | } |
| 1911 | 2109 | ||
| 1912 | int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) | 2110 | int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) |
diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h index 57b3edebbb40..3828beab93f2 100644 --- a/virt/kvm/arm/trace.h +++ b/virt/kvm/arm/trace.h | |||
| @@ -26,25 +26,25 @@ TRACE_EVENT(kvm_entry, | |||
| 26 | ); | 26 | ); |
| 27 | 27 | ||
| 28 | TRACE_EVENT(kvm_exit, | 28 | TRACE_EVENT(kvm_exit, |
| 29 | TP_PROTO(int idx, unsigned int exit_reason, unsigned long vcpu_pc), | 29 | TP_PROTO(int ret, unsigned int esr_ec, unsigned long vcpu_pc), |
| 30 | TP_ARGS(idx, exit_reason, vcpu_pc), | 30 | TP_ARGS(ret, esr_ec, vcpu_pc), |
| 31 | 31 | ||
| 32 | TP_STRUCT__entry( | 32 | TP_STRUCT__entry( |
| 33 | __field( int, idx ) | 33 | __field( int, ret ) |
| 34 | __field( unsigned int, exit_reason ) | 34 | __field( unsigned int, esr_ec ) |
| 35 | __field( unsigned long, vcpu_pc ) | 35 | __field( unsigned long, vcpu_pc ) |
| 36 | ), | 36 | ), |
| 37 | 37 | ||
| 38 | TP_fast_assign( | 38 | TP_fast_assign( |
| 39 | __entry->idx = idx; | 39 | __entry->ret = ARM_EXCEPTION_CODE(ret); |
| 40 | __entry->exit_reason = exit_reason; | 40 | __entry->esr_ec = ARM_EXCEPTION_IS_TRAP(ret) ? esr_ec : 0; |
| 41 | __entry->vcpu_pc = vcpu_pc; | 41 | __entry->vcpu_pc = vcpu_pc; |
| 42 | ), | 42 | ), |
| 43 | 43 | ||
| 44 | TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx", | 44 | TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx", |
| 45 | __print_symbolic(__entry->idx, kvm_arm_exception_type), | 45 | __print_symbolic(__entry->ret, kvm_arm_exception_type), |
| 46 | __entry->exit_reason, | 46 | __entry->esr_ec, |
| 47 | __print_symbolic(__entry->exit_reason, kvm_arm_exception_class), | 47 | __print_symbolic(__entry->esr_ec, kvm_arm_exception_class), |
| 48 | __entry->vcpu_pc) | 48 | __entry->vcpu_pc) |
| 49 | ); | 49 | ); |
| 50 | 50 | ||
diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index f56ff1cf52ec..ceeda7e04a4d 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c | |||
| @@ -313,36 +313,30 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, | |||
| 313 | 313 | ||
| 314 | spin_lock_irqsave(&irq->irq_lock, flags); | 314 | spin_lock_irqsave(&irq->irq_lock, flags); |
| 315 | 315 | ||
| 316 | /* | ||
| 317 | * If this virtual IRQ was written into a list register, we | ||
| 318 | * have to make sure the CPU that runs the VCPU thread has | ||
| 319 | * synced back the LR state to the struct vgic_irq. | ||
| 320 | * | ||
| 321 | * As long as the conditions below are true, we know the VCPU thread | ||
| 322 | * may be on its way back from the guest (we kicked the VCPU thread in | ||
| 323 | * vgic_change_active_prepare) and still has to sync back this IRQ, | ||
| 324 | * so we release and re-acquire the spin_lock to let the other thread | ||
| 325 | * sync back the IRQ. | ||
| 326 | * | ||
| 327 | * When accessing VGIC state from user space, requester_vcpu is | ||
| 328 | * NULL, which is fine, because we guarantee that no VCPUs are running | ||
| 329 | * when accessing VGIC state from user space so irq->vcpu->cpu is | ||
| 330 | * always -1. | ||
| 331 | */ | ||
| 332 | while (irq->vcpu && /* IRQ may have state in an LR somewhere */ | ||
| 333 | irq->vcpu != requester_vcpu && /* Current thread is not the VCPU thread */ | ||
| 334 | irq->vcpu->cpu != -1) /* VCPU thread is running */ | ||
| 335 | cond_resched_lock(&irq->irq_lock); | ||
| 336 | |||
| 337 | if (irq->hw) { | 316 | if (irq->hw) { |
| 338 | vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu); | 317 | vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu); |
| 339 | } else { | 318 | } else { |
| 340 | u32 model = vcpu->kvm->arch.vgic.vgic_model; | 319 | u32 model = vcpu->kvm->arch.vgic.vgic_model; |
| 320 | u8 active_source; | ||
| 341 | 321 | ||
| 342 | irq->active = active; | 322 | irq->active = active; |
| 323 | |||
| 324 | /* | ||
| 325 | * The GICv2 architecture indicates that the source CPUID for | ||
| 326 | * an SGI should be provided during an EOI which implies that | ||
| 327 | * the active state is stored somewhere, but at the same time | ||
| 328 | * this state is not architecturally exposed anywhere and we | ||
| 329 | * have no way of knowing the right source. | ||
| 330 | * | ||
| 331 | * This may lead to a VCPU not being able to receive | ||
| 332 | * additional instances of a particular SGI after migration | ||
| 333 | * for a GICv2 VM on some GIC implementations. Oh well. | ||
| 334 | */ | ||
| 335 | active_source = (requester_vcpu) ? requester_vcpu->vcpu_id : 0; | ||
| 336 | |||
| 343 | if (model == KVM_DEV_TYPE_ARM_VGIC_V2 && | 337 | if (model == KVM_DEV_TYPE_ARM_VGIC_V2 && |
| 344 | active && vgic_irq_is_sgi(irq->intid)) | 338 | active && vgic_irq_is_sgi(irq->intid)) |
| 345 | irq->active_source = requester_vcpu->vcpu_id; | 339 | irq->active_source = active_source; |
| 346 | } | 340 | } |
| 347 | 341 | ||
| 348 | if (irq->active) | 342 | if (irq->active) |
| @@ -368,14 +362,16 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, | |||
| 368 | */ | 362 | */ |
| 369 | static void vgic_change_active_prepare(struct kvm_vcpu *vcpu, u32 intid) | 363 | static void vgic_change_active_prepare(struct kvm_vcpu *vcpu, u32 intid) |
| 370 | { | 364 | { |
| 371 | if (intid > VGIC_NR_PRIVATE_IRQS) | 365 | if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || |
| 366 | intid > VGIC_NR_PRIVATE_IRQS) | ||
| 372 | kvm_arm_halt_guest(vcpu->kvm); | 367 | kvm_arm_halt_guest(vcpu->kvm); |
| 373 | } | 368 | } |
| 374 | 369 | ||
| 375 | /* See vgic_change_active_prepare */ | 370 | /* See vgic_change_active_prepare */ |
| 376 | static void vgic_change_active_finish(struct kvm_vcpu *vcpu, u32 intid) | 371 | static void vgic_change_active_finish(struct kvm_vcpu *vcpu, u32 intid) |
| 377 | { | 372 | { |
| 378 | if (intid > VGIC_NR_PRIVATE_IRQS) | 373 | if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || |
| 374 | intid > VGIC_NR_PRIVATE_IRQS) | ||
| 379 | kvm_arm_resume_guest(vcpu->kvm); | 375 | kvm_arm_resume_guest(vcpu->kvm); |
| 380 | } | 376 | } |
| 381 | 377 | ||
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 7cfdfbc910e0..a6b135491b6c 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c | |||
| @@ -103,13 +103,13 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, | |||
| 103 | { | 103 | { |
| 104 | /* SGIs and PPIs */ | 104 | /* SGIs and PPIs */ |
| 105 | if (intid <= VGIC_MAX_PRIVATE) { | 105 | if (intid <= VGIC_MAX_PRIVATE) { |
| 106 | intid = array_index_nospec(intid, VGIC_MAX_PRIVATE); | 106 | intid = array_index_nospec(intid, VGIC_MAX_PRIVATE + 1); |
| 107 | return &vcpu->arch.vgic_cpu.private_irqs[intid]; | 107 | return &vcpu->arch.vgic_cpu.private_irqs[intid]; |
| 108 | } | 108 | } |
| 109 | 109 | ||
| 110 | /* SPIs */ | 110 | /* SPIs */ |
| 111 | if (intid <= VGIC_MAX_SPI) { | 111 | if (intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) { |
| 112 | intid = array_index_nospec(intid, VGIC_MAX_SPI); | 112 | intid = array_index_nospec(intid, kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS); |
| 113 | return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS]; | 113 | return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS]; |
| 114 | } | 114 | } |
| 115 | 115 | ||
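The bound passed to array_index_nospec() is an element count, so an index that may legitimately equal VGIC_MAX_PRIVATE needs VGIC_MAX_PRIVATE + 1, and SPIs are now bounded by the per-VM nr_spis rather than the compile-time maximum. Below is a minimal userspace illustration of the off-by-one, with VGIC_MAX_PRIVATE assumed to derive from 32 private interrupts (16 SGIs + 16 PPIs) and a plain bounds check standing in for the kernel primitive.

#include <assert.h>

#define VGIC_NR_PRIVATE_IRQS 32                 /* assumed: 16 SGIs + 16 PPIs */
#define VGIC_MAX_PRIVATE (VGIC_NR_PRIVATE_IRQS - 1)

int main(void)
{
        unsigned int intid = VGIC_MAX_PRIVATE;  /* PPI 31, still a private IRQ */

        /* Old bound: 31 is not < 31, so the largest valid index was mishandled.
         * New bound: 31 < 32, the full private range is covered. */
        assert(!(intid < VGIC_MAX_PRIVATE));
        assert(intid < VGIC_MAX_PRIVATE + 1);
        return 0;
}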
| @@ -908,6 +908,7 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) | |||
| 908 | struct vgic_irq *irq; | 908 | struct vgic_irq *irq; |
| 909 | bool pending = false; | 909 | bool pending = false; |
| 910 | unsigned long flags; | 910 | unsigned long flags; |
| 911 | struct vgic_vmcr vmcr; | ||
| 911 | 912 | ||
| 912 | if (!vcpu->kvm->arch.vgic.enabled) | 913 | if (!vcpu->kvm->arch.vgic.enabled) |
| 913 | return false; | 914 | return false; |
| @@ -915,11 +916,15 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) | |||
| 915 | if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last) | 916 | if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last) |
| 916 | return true; | 917 | return true; |
| 917 | 918 | ||
| 919 | vgic_get_vmcr(vcpu, &vmcr); | ||
| 920 | |||
| 918 | spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); | 921 | spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); |
| 919 | 922 | ||
| 920 | list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { | 923 | list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { |
| 921 | spin_lock(&irq->irq_lock); | 924 | spin_lock(&irq->irq_lock); |
| 922 | pending = irq_is_pending(irq) && irq->enabled; | 925 | pending = irq_is_pending(irq) && irq->enabled && |
| 926 | !irq->active && | ||
| 927 | irq->priority < vmcr.pmr; | ||
| 923 | spin_unlock(&irq->irq_lock); | 928 | spin_unlock(&irq->irq_lock); |
| 924 | 929 | ||
| 925 | if (pending) | 930 | if (pending) |
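kvm_vgic_vcpu_pending_irq() now also requires the interrupt to be inactive and to have a priority that beats the priority mask. The small userspace program below (not kernel code) evaluates the same predicate, using the GIC convention that a lower numerical priority is a higher priority and that an interrupt is only signalled when its priority is strictly below the PMR.

#include <stdio.h>

struct toy_irq {
        int pending, enabled, active;
        unsigned char priority;
};

static int irq_wakes_vcpu(const struct toy_irq *irq, unsigned char pmr)
{
        return irq->pending && irq->enabled && !irq->active &&
               irq->priority < pmr;
}

int main(void)
{
        struct toy_irq irq = { .pending = 1, .enabled = 1, .active = 0, .priority = 0xa0 };

        printf("%d\n", irq_wakes_vcpu(&irq, 0xf0));     /* 1: 0xa0 beats PMR 0xf0 */
        printf("%d\n", irq_wakes_vcpu(&irq, 0x80));     /* 0: masked by the PMR   */

        irq.active = 1;
        printf("%d\n", irq_wakes_vcpu(&irq, 0xf0));     /* 0: already active      */
        return 0;
}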
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 23c2519c5b32..110cbe3f74f8 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c | |||
| @@ -82,7 +82,7 @@ static void async_pf_execute(struct work_struct *work) | |||
| 82 | might_sleep(); | 82 | might_sleep(); |
| 83 | 83 | ||
| 84 | /* | 84 | /* |
| 85 | * This work is run asynchromously to the task which owns | 85 | * This work is run asynchronously to the task which owns |
| 86 | * mm and might be done in another context, so we must | 86 | * mm and might be done in another context, so we must |
| 87 | * access remotely. | 87 | * access remotely. |
| 88 | */ | 88 | */ |
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2679e476b6c3..cf7cc0554094 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c | |||
| @@ -354,7 +354,10 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, | |||
| 354 | idx = srcu_read_lock(&kvm->srcu); | 354 | idx = srcu_read_lock(&kvm->srcu); |
| 355 | spin_lock(&kvm->mmu_lock); | 355 | spin_lock(&kvm->mmu_lock); |
| 356 | kvm->mmu_notifier_seq++; | 356 | kvm->mmu_notifier_seq++; |
| 357 | kvm_set_spte_hva(kvm, address, pte); | 357 | |
| 358 | if (kvm_set_spte_hva(kvm, address, pte)) | ||
| 359 | kvm_flush_remote_tlbs(kvm); | ||
| 360 | |||
| 358 | spin_unlock(&kvm->mmu_lock); | 361 | spin_unlock(&kvm->mmu_lock); |
| 359 | srcu_read_unlock(&kvm->srcu, idx); | 362 | srcu_read_unlock(&kvm->srcu, idx); |
| 360 | } | 363 | } |
| @@ -1133,7 +1136,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log); | |||
| 1133 | #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT | 1136 | #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT |
| 1134 | /** | 1137 | /** |
| 1135 | * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages | 1138 | * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages |
| 1136 | * are dirty write protect them for next write. | 1139 | * and reenable dirty page tracking for the corresponding pages. |
| 1137 | * @kvm: pointer to kvm instance | 1140 | * @kvm: pointer to kvm instance |
| 1138 | * @log: slot id and address to which we copy the log | 1141 | * @log: slot id and address to which we copy the log |
| 1139 | * @is_dirty: flag set if any page is dirty | 1142 | * @is_dirty: flag set if any page is dirty |
| @@ -1154,7 +1157,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log); | |||
| 1154 | * | 1157 | * |
| 1155 | */ | 1158 | */ |
| 1156 | int kvm_get_dirty_log_protect(struct kvm *kvm, | 1159 | int kvm_get_dirty_log_protect(struct kvm *kvm, |
| 1157 | struct kvm_dirty_log *log, bool *is_dirty) | 1160 | struct kvm_dirty_log *log, bool *flush) |
| 1158 | { | 1161 | { |
| 1159 | struct kvm_memslots *slots; | 1162 | struct kvm_memslots *slots; |
| 1160 | struct kvm_memory_slot *memslot; | 1163 | struct kvm_memory_slot *memslot; |
| @@ -1176,37 +1179,114 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, | |||
| 1176 | return -ENOENT; | 1179 | return -ENOENT; |
| 1177 | 1180 | ||
| 1178 | n = kvm_dirty_bitmap_bytes(memslot); | 1181 | n = kvm_dirty_bitmap_bytes(memslot); |
| 1182 | *flush = false; | ||
| 1183 | if (kvm->manual_dirty_log_protect) { | ||
| 1184 | /* | ||
| 1185 | * Unlike kvm_get_dirty_log, we always return false in *flush, | ||
| 1186 | * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There | ||
| 1187 | * is some code duplication between this function and | ||
| 1188 | * kvm_get_dirty_log, but hopefully all architecture | ||
| 1189 | * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log | ||
| 1190 | * can be eliminated. | ||
| 1191 | */ | ||
| 1192 | dirty_bitmap_buffer = dirty_bitmap; | ||
| 1193 | } else { | ||
| 1194 | dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); | ||
| 1195 | memset(dirty_bitmap_buffer, 0, n); | ||
| 1196 | |||
| 1197 | spin_lock(&kvm->mmu_lock); | ||
| 1198 | for (i = 0; i < n / sizeof(long); i++) { | ||
| 1199 | unsigned long mask; | ||
| 1200 | gfn_t offset; | ||
| 1201 | |||
| 1202 | if (!dirty_bitmap[i]) | ||
| 1203 | continue; | ||
| 1204 | |||
| 1205 | *flush = true; | ||
| 1206 | mask = xchg(&dirty_bitmap[i], 0); | ||
| 1207 | dirty_bitmap_buffer[i] = mask; | ||
| 1208 | |||
| 1209 | if (mask) { | ||
| 1210 | offset = i * BITS_PER_LONG; | ||
| 1211 | kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, | ||
| 1212 | offset, mask); | ||
| 1213 | } | ||
| 1214 | } | ||
| 1215 | spin_unlock(&kvm->mmu_lock); | ||
| 1216 | } | ||
| 1217 | |||
| 1218 | if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) | ||
| 1219 | return -EFAULT; | ||
| 1220 | return 0; | ||
| 1221 | } | ||
| 1222 | EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); | ||
| 1179 | 1223 | ||
| 1224 | /** | ||
| 1225 | * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap | ||
| 1226 | * and reenable dirty page tracking for the corresponding pages. | ||
| 1227 | * @kvm: pointer to kvm instance | ||
| 1228 | * @log: slot id and address from which to fetch the bitmap of dirty pages | ||
| 1229 | */ | ||
| 1230 | int kvm_clear_dirty_log_protect(struct kvm *kvm, | ||
| 1231 | struct kvm_clear_dirty_log *log, bool *flush) | ||
| 1232 | { | ||
| 1233 | struct kvm_memslots *slots; | ||
| 1234 | struct kvm_memory_slot *memslot; | ||
| 1235 | int as_id, id, n; | ||
| 1236 | gfn_t offset; | ||
| 1237 | unsigned long i; | ||
| 1238 | unsigned long *dirty_bitmap; | ||
| 1239 | unsigned long *dirty_bitmap_buffer; | ||
| 1240 | |||
| 1241 | as_id = log->slot >> 16; | ||
| 1242 | id = (u16)log->slot; | ||
| 1243 | if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) | ||
| 1244 | return -EINVAL; | ||
| 1245 | |||
| 1246 | if ((log->first_page & 63) || (log->num_pages & 63)) | ||
| 1247 | return -EINVAL; | ||
| 1248 | |||
| 1249 | slots = __kvm_memslots(kvm, as_id); | ||
| 1250 | memslot = id_to_memslot(slots, id); | ||
| 1251 | |||
| 1252 | dirty_bitmap = memslot->dirty_bitmap; | ||
| 1253 | if (!dirty_bitmap) | ||
| 1254 | return -ENOENT; | ||
| 1255 | |||
| 1256 | n = kvm_dirty_bitmap_bytes(memslot); | ||
| 1257 | *flush = false; | ||
| 1180 | dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); | 1258 | dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); |
| 1181 | memset(dirty_bitmap_buffer, 0, n); | 1259 | if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n)) |
| 1260 | return -EFAULT; | ||
| 1182 | 1261 | ||
| 1183 | spin_lock(&kvm->mmu_lock); | 1262 | spin_lock(&kvm->mmu_lock); |
| 1184 | *is_dirty = false; | 1263 | for (offset = log->first_page, |
| 1185 | for (i = 0; i < n / sizeof(long); i++) { | 1264 | i = offset / BITS_PER_LONG, n = log->num_pages / BITS_PER_LONG; n--; |
| 1186 | unsigned long mask; | 1265 | i++, offset += BITS_PER_LONG) { |
| 1187 | gfn_t offset; | 1266 | unsigned long mask = *dirty_bitmap_buffer++; |
| 1188 | 1267 | atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i]; | |
| 1189 | if (!dirty_bitmap[i]) | 1268 | if (!mask) |
| 1190 | continue; | 1269 | continue; |
| 1191 | 1270 | ||
| 1192 | *is_dirty = true; | 1271 | mask &= atomic_long_fetch_andnot(mask, p); |
| 1193 | |||
| 1194 | mask = xchg(&dirty_bitmap[i], 0); | ||
| 1195 | dirty_bitmap_buffer[i] = mask; | ||
| 1196 | 1272 | ||
| 1273 | /* | ||
| 1274 | * mask contains the bits that really have been cleared. This | ||
| 1275 | * never includes any bits beyond the length of the memslot (if | ||
| 1276 | * the length is not aligned to 64 pages), therefore it is not | ||
| 1277 | * a problem if userspace sets them in log->dirty_bitmap. | ||
| 1278 | */ | ||
| 1197 | if (mask) { | 1279 | if (mask) { |
| 1198 | offset = i * BITS_PER_LONG; | 1280 | *flush = true; |
| 1199 | kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, | 1281 | kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, |
| 1200 | offset, mask); | 1282 | offset, mask); |
| 1201 | } | 1283 | } |
| 1202 | } | 1284 | } |
| 1203 | |||
| 1204 | spin_unlock(&kvm->mmu_lock); | 1285 | spin_unlock(&kvm->mmu_lock); |
| 1205 | if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) | 1286 | |
| 1206 | return -EFAULT; | ||
| 1207 | return 0; | 1287 | return 0; |
| 1208 | } | 1288 | } |
| 1209 | EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); | 1289 | EXPORT_SYMBOL_GPL(kvm_clear_dirty_log_protect); |
| 1210 | #endif | 1290 | #endif |
| 1211 | 1291 | ||
| 1212 | bool kvm_largepages_enabled(void) | 1292 | bool kvm_largepages_enabled(void) |
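From userspace, the split implemented above turns dirty-log handling into a two-step sequence: KVM_GET_DIRTY_LOG fetches the bitmap without dropping write protection, and KVM_CLEAR_DIRTY_LOG later re-protects exactly the pages the caller has dealt with. The sketch below is illustrative rather than authoritative: it assumes headers that provide KVM_CLEAR_DIRTY_LOG, a VM with KVM_CAP_MANUAL_DIRTY_LOG_PROTECT enabled, and a memslot of npages pages (a multiple of 64, per the check above) created with KVM_MEM_LOG_DIRTY_PAGES; error handling is minimal.

#include <linux/kvm.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>

static int harvest_and_clear(int vm_fd, __u32 slot, __u64 npages)
{
        size_t bytes = ((npages + 63) / 64) * 8;        /* one bit per page, long-aligned */
        void *bitmap = calloc(1, bytes);
        struct kvm_dirty_log get;
        struct kvm_clear_dirty_log clear;
        int ret;

        if (!bitmap)
                return -1;

        memset(&get, 0, sizeof(get));
        get.slot = slot;
        get.dirty_bitmap = bitmap;
        ret = ioctl(vm_fd, KVM_GET_DIRTY_LOG, &get);    /* bits stay set in KVM */
        if (ret)
                goto out;

        /* ... copy out / migrate the pages whose bits are set ... */

        memset(&clear, 0, sizeof(clear));
        clear.slot = slot;
        clear.first_page = 0;                   /* must be a multiple of 64 */
        clear.num_pages = npages;               /* must be a multiple of 64 here */
        clear.dirty_bitmap = bitmap;            /* which pages to re-protect */
        ret = ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear);
out:
        free(bitmap);
        return ret;
}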
| @@ -1928,32 +2008,33 @@ static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, | |||
| 1928 | gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; | 2008 | gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; |
| 1929 | gfn_t nr_pages_needed = end_gfn - start_gfn + 1; | 2009 | gfn_t nr_pages_needed = end_gfn - start_gfn + 1; |
| 1930 | gfn_t nr_pages_avail; | 2010 | gfn_t nr_pages_avail; |
| 2011 | int r = start_gfn <= end_gfn ? 0 : -EINVAL; | ||
| 1931 | 2012 | ||
| 1932 | ghc->gpa = gpa; | 2013 | ghc->gpa = gpa; |
| 1933 | ghc->generation = slots->generation; | 2014 | ghc->generation = slots->generation; |
| 1934 | ghc->len = len; | 2015 | ghc->len = len; |
| 1935 | ghc->memslot = __gfn_to_memslot(slots, start_gfn); | 2016 | ghc->hva = KVM_HVA_ERR_BAD; |
| 1936 | ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL); | 2017 | |
| 1937 | if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) { | 2018 | /* |
| 2019 | * If the requested region crosses two memslots, we still | ||
| 2020 | * verify that the entire region is valid here. | ||
| 2021 | */ | ||
| 2022 | while (!r && start_gfn <= end_gfn) { | ||
| 2023 | ghc->memslot = __gfn_to_memslot(slots, start_gfn); | ||
| 2024 | ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, | ||
| 2025 | &nr_pages_avail); | ||
| 2026 | if (kvm_is_error_hva(ghc->hva)) | ||
| 2027 | r = -EFAULT; | ||
| 2028 | start_gfn += nr_pages_avail; | ||
| 2029 | } | ||
| 2030 | |||
| 2031 | /* Use the slow path for cross page reads and writes. */ | ||
| 2032 | if (!r && nr_pages_needed == 1) | ||
| 1938 | ghc->hva += offset; | 2033 | ghc->hva += offset; |
| 1939 | } else { | 2034 | else |
| 1940 | /* | ||
| 1941 | * If the requested region crosses two memslots, we still | ||
| 1942 | * verify that the entire region is valid here. | ||
| 1943 | */ | ||
| 1944 | while (start_gfn <= end_gfn) { | ||
| 1945 | nr_pages_avail = 0; | ||
| 1946 | ghc->memslot = __gfn_to_memslot(slots, start_gfn); | ||
| 1947 | ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, | ||
| 1948 | &nr_pages_avail); | ||
| 1949 | if (kvm_is_error_hva(ghc->hva)) | ||
| 1950 | return -EFAULT; | ||
| 1951 | start_gfn += nr_pages_avail; | ||
| 1952 | } | ||
| 1953 | /* Use the slow path for cross page reads and writes. */ | ||
| 1954 | ghc->memslot = NULL; | 2035 | ghc->memslot = NULL; |
| 1955 | } | 2036 | |
| 1956 | return 0; | 2037 | return r; |
| 1957 | } | 2038 | } |
| 1958 | 2039 | ||
| 1959 | int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, | 2040 | int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, |
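The rewritten __kvm_gfn_to_hva_cache_init() always validates the whole range and only keeps a usable hva when a single page is needed. The standalone snippet below (userspace, not kernel code) reproduces the page-count computation to show when the slow path is taken, assuming 4K pages.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12                           /* assume 4K pages */

static uint64_t nr_pages_needed(uint64_t gpa, unsigned long len)
{
        uint64_t start_gfn = gpa >> PAGE_SHIFT;
        uint64_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;

        return end_gfn - start_gfn + 1;
}

int main(void)
{
        /* 8 bytes wholly inside one page: the cached hva can be used. */
        printf("%llu\n", (unsigned long long)nr_pages_needed(0x1000, 8));
        /* 8 bytes straddling a page boundary: slow path, memslot left NULL. */
        printf("%llu\n", (unsigned long long)nr_pages_needed(0x1ffc, 8));
        return 0;
}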
| @@ -1965,7 +2046,8 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, | |||
| 1965 | EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); | 2046 | EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); |
| 1966 | 2047 | ||
| 1967 | int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, | 2048 | int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, |
| 1968 | void *data, int offset, unsigned long len) | 2049 | void *data, unsigned int offset, |
| 2050 | unsigned long len) | ||
| 1969 | { | 2051 | { |
| 1970 | struct kvm_memslots *slots = kvm_memslots(kvm); | 2052 | struct kvm_memslots *slots = kvm_memslots(kvm); |
| 1971 | int r; | 2053 | int r; |
| @@ -2948,6 +3030,10 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) | |||
| 2948 | #endif | 3030 | #endif |
| 2949 | case KVM_CAP_IOEVENTFD_ANY_LENGTH: | 3031 | case KVM_CAP_IOEVENTFD_ANY_LENGTH: |
| 2950 | case KVM_CAP_CHECK_EXTENSION_VM: | 3032 | case KVM_CAP_CHECK_EXTENSION_VM: |
| 3033 | case KVM_CAP_ENABLE_CAP_VM: | ||
| 3034 | #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT | ||
| 3035 | case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT: | ||
| 3036 | #endif | ||
| 2951 | return 1; | 3037 | return 1; |
| 2952 | #ifdef CONFIG_KVM_MMIO | 3038 | #ifdef CONFIG_KVM_MMIO |
| 2953 | case KVM_CAP_COALESCED_MMIO: | 3039 | case KVM_CAP_COALESCED_MMIO: |
| @@ -2971,6 +3057,28 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) | |||
| 2971 | return kvm_vm_ioctl_check_extension(kvm, arg); | 3057 | return kvm_vm_ioctl_check_extension(kvm, arg); |
| 2972 | } | 3058 | } |
| 2973 | 3059 | ||
| 3060 | int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm, | ||
| 3061 | struct kvm_enable_cap *cap) | ||
| 3062 | { | ||
| 3063 | return -EINVAL; | ||
| 3064 | } | ||
| 3065 | |||
| 3066 | static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, | ||
| 3067 | struct kvm_enable_cap *cap) | ||
| 3068 | { | ||
| 3069 | switch (cap->cap) { | ||
| 3070 | #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT | ||
| 3071 | case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT: | ||
| 3072 | if (cap->flags || (cap->args[0] & ~1)) | ||
| 3073 | return -EINVAL; | ||
| 3074 | kvm->manual_dirty_log_protect = cap->args[0]; | ||
| 3075 | return 0; | ||
| 3076 | #endif | ||
| 3077 | default: | ||
| 3078 | return kvm_vm_ioctl_enable_cap(kvm, cap); | ||
| 3079 | } | ||
| 3080 | } | ||
| 3081 | |||
| 2974 | static long kvm_vm_ioctl(struct file *filp, | 3082 | static long kvm_vm_ioctl(struct file *filp, |
| 2975 | unsigned int ioctl, unsigned long arg) | 3083 | unsigned int ioctl, unsigned long arg) |
| 2976 | { | 3084 | { |
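As a usage illustration (not part of this patch), a hedged userspace sketch of how a VMM could probe for the new capability and enable it through the KVM_ENABLE_CAP vm ioctl dispatched in the next hunk; vm_fd is an assumed, already-created VM file descriptor.

/* Illustrative sketch only -- not part of this patch.  "vm_fd" is an
 * assumed, already-open VM file descriptor. */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <string.h>

static int enable_manual_dirty_log_protect(int vm_fd)
{
	struct kvm_enable_cap cap;

	/* Reported only when the generic dirty-log write-protect code is
	 * compiled in (see the #ifdef added above). */
	if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT) <= 0)
		return -1;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT;
	cap.args[0] = 1;	/* anything other than 0 or 1 is rejected */
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}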
| @@ -2984,6 +3092,15 @@ static long kvm_vm_ioctl(struct file *filp, | |||
| 2984 | case KVM_CREATE_VCPU: | 3092 | case KVM_CREATE_VCPU: |
| 2985 | r = kvm_vm_ioctl_create_vcpu(kvm, arg); | 3093 | r = kvm_vm_ioctl_create_vcpu(kvm, arg); |
| 2986 | break; | 3094 | break; |
| 3095 | case KVM_ENABLE_CAP: { | ||
| 3096 | struct kvm_enable_cap cap; | ||
| 3097 | |||
| 3098 | r = -EFAULT; | ||
| 3099 | if (copy_from_user(&cap, argp, sizeof(cap))) | ||
| 3100 | goto out; | ||
| 3101 | r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap); | ||
| 3102 | break; | ||
| 3103 | } | ||
| 2987 | case KVM_SET_USER_MEMORY_REGION: { | 3104 | case KVM_SET_USER_MEMORY_REGION: { |
| 2988 | struct kvm_userspace_memory_region kvm_userspace_mem; | 3105 | struct kvm_userspace_memory_region kvm_userspace_mem; |
| 2989 | 3106 | ||
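Finally, a hedged sketch (not part of this patch) of the two-step harvest the capability enables, relying on the KVM_CLEAR_DIRTY_LOG dispatch added in the following hunk; the slot number and page count are assumptions for the example, and both first_page and num_pages must remain multiples of 64.

/* Illustrative sketch only -- not part of this patch.  Slot 0 and a
 * 65536-page region are assumptions made for the example. */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdlib.h>

#define EXAMPLE_SLOT	0
#define EXAMPLE_NPAGES	65536		/* must be a multiple of 64 */

static int harvest_and_rearm(int vm_fd)
{
	unsigned long *bitmap = calloc(EXAMPLE_NPAGES / 8, 1);
	struct kvm_dirty_log get = { .slot = EXAMPLE_SLOT };
	struct kvm_clear_dirty_log clear = {
		.slot = EXAMPLE_SLOT,
		.first_page = 0,
		.num_pages = EXAMPLE_NPAGES,
	};
	int ret;

	if (!bitmap)
		return -1;
	get.dirty_bitmap = bitmap;

	/* With manual protect enabled this only fetches the bitmap;
	 * nothing is cleared or write-protected yet. */
	ret = ioctl(vm_fd, KVM_GET_DIRTY_LOG, &get);
	if (ret)
		goto out;

	/* ... copy out the pages marked dirty in "bitmap" ... */

	/* Re-arm dirty tracking only for the pages just harvested. */
	clear.dirty_bitmap = bitmap;
	ret = ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear);
out:
	free(bitmap);
	return ret;
}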
| @@ -3004,6 +3121,17 @@ static long kvm_vm_ioctl(struct file *filp, | |||
| 3004 | r = kvm_vm_ioctl_get_dirty_log(kvm, &log); | 3121 | r = kvm_vm_ioctl_get_dirty_log(kvm, &log); |
| 3005 | break; | 3122 | break; |
| 3006 | } | 3123 | } |
| 3124 | #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT | ||
| 3125 | case KVM_CLEAR_DIRTY_LOG: { | ||
| 3126 | struct kvm_clear_dirty_log log; | ||
| 3127 | |||
| 3128 | r = -EFAULT; | ||
| 3129 | if (copy_from_user(&log, argp, sizeof(log))) | ||
| 3130 | goto out; | ||
| 3131 | r = kvm_vm_ioctl_clear_dirty_log(kvm, &log); | ||
| 3132 | break; | ||
| 3133 | } | ||
| 3134 | #endif | ||
| 3007 | #ifdef CONFIG_KVM_MMIO | 3135 | #ifdef CONFIG_KVM_MMIO |
| 3008 | case KVM_REGISTER_COALESCED_MMIO: { | 3136 | case KVM_REGISTER_COALESCED_MMIO: { |
| 3009 | struct kvm_coalesced_mmio_zone zone; | 3137 | struct kvm_coalesced_mmio_zone zone; |
