Diffstat (limited to 'arch/x86/kvm/vmx.c')
 -rw-r--r--  arch/x86/kvm/vmx.c | 203
 1 file changed, 145 insertions(+), 58 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f85815945fc6..9120ae1901e4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -42,6 +42,7 @@
 #include <asm/i387.h>
 #include <asm/xcr.h>
 #include <asm/perf_event.h>
+#include <asm/kexec.h>
 
 #include "trace.h"
 
@@ -802,11 +803,6 @@ static inline bool cpu_has_vmx_ept_ad_bits(void)
 	return vmx_capability.ept & VMX_EPT_AD_BIT;
 }
 
-static inline bool cpu_has_vmx_invept_individual_addr(void)
-{
-	return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
-}
-
 static inline bool cpu_has_vmx_invept_context(void)
 {
 	return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
@@ -992,6 +988,46 @@ static void vmcs_load(struct vmcs *vmcs)
 		       vmcs, phys_addr);
 }
 
+#ifdef CONFIG_KEXEC
+/*
+ * This bitmap is used to indicate whether the vmclear
+ * operation is enabled on all cpus. All disabled by
+ * default.
+ */
+static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
+
+static inline void crash_enable_local_vmclear(int cpu)
+{
+	cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
+}
+
+static inline void crash_disable_local_vmclear(int cpu)
+{
+	cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
+}
+
+static inline int crash_local_vmclear_enabled(int cpu)
+{
+	return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
+}
+
+static void crash_vmclear_local_loaded_vmcss(void)
+{
+	int cpu = raw_smp_processor_id();
+	struct loaded_vmcs *v;
+
+	if (!crash_local_vmclear_enabled(cpu))
+		return;
+
+	list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
+			    loaded_vmcss_on_cpu_link)
+		vmcs_clear(v->vmcs);
+}
+#else
+static inline void crash_enable_local_vmclear(int cpu) { }
+static inline void crash_disable_local_vmclear(int cpu) { }
+#endif /* CONFIG_KEXEC */
+
 static void __loaded_vmcs_clear(void *arg)
 {
 	struct loaded_vmcs *loaded_vmcs = arg;
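For context: crash_vmclear_local_loaded_vmcss() is never called from within KVM itself. It is published through a function pointer declared in asm/kexec.h and invoked on the way into the crash kernel, so a kdump kernel never inherits live VMCS state; vmx_init()/vmx_exit() below register and unregister the callback with rcu_assign_pointer(). A sketch of the companion kexec-side code from this series (it lives outside this file, so treat the exact shape as an assumption):

typedef void crash_vmclear_fn(void);
extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;

/* Called on each cpu during crash shutdown, possibly from NMI context. */
static void cpu_crash_vmclear_loaded_vmcss(void)
{
        crash_vmclear_fn *do_vmclear_operation = NULL;

        rcu_read_lock();
        do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss);
        if (do_vmclear_operation)
                do_vmclear_operation();
        rcu_read_unlock();
}

The per-cpu enable bitmap exists because this can fire at any instant; vmx.c only sets a cpu's bit while that cpu's loaded_vmcss_on_cpu list is in a consistent state.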
@@ -1001,15 +1037,28 @@ static void __loaded_vmcs_clear(void *arg)
 		return; /* vcpu migration can race with cpu offline */
 	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
 		per_cpu(current_vmcs, cpu) = NULL;
+	crash_disable_local_vmclear(cpu);
 	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
+
+	/*
+	 * Ensure the update of loaded_vmcs->loaded_vmcss_on_cpu_link
+	 * happens before loaded_vmcs->cpu is set to -1 in loaded_vmcs_init.
+	 * Otherwise, another cpu could see cpu == -1 first and add the
+	 * vmcs to its percpu list before it is deleted here.
+	 */
+	smp_wmb();
+
 	loaded_vmcs_init(loaded_vmcs);
+	crash_enable_local_vmclear(cpu);
 }
 
 static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
 {
-	if (loaded_vmcs->cpu != -1)
-		smp_call_function_single(
-			loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1);
+	int cpu = loaded_vmcs->cpu;
+
+	if (cpu != -1)
+		smp_call_function_single(cpu,
+			__loaded_vmcs_clear, loaded_vmcs, 1);
 }
 
 static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
@@ -1051,17 +1100,6 @@ static inline void ept_sync_context(u64 eptp)
 	}
 }
 
-static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
-{
-	if (enable_ept) {
-		if (cpu_has_vmx_invept_individual_addr())
-			__invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
-					eptp, gpa);
-		else
-			ept_sync_context(eptp);
-	}
-}
-
 static __always_inline unsigned long vmcs_readl(unsigned long field)
 {
 	unsigned long value;
@@ -1535,8 +1573,18 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 		local_irq_disable();
+		crash_disable_local_vmclear(cpu);
+
+		/*
+		 * The read of loaded_vmcs->cpu must happen before adding
+		 * loaded_vmcs to this cpu's percpu list.
+		 * See the comments in __loaded_vmcs_clear().
+		 */
+		smp_rmb();
+
 		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
 			 &per_cpu(loaded_vmcss_on_cpu, cpu));
+		crash_enable_local_vmclear(cpu);
 		local_irq_enable();
 
 	/*
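The smp_rmb() added here pairs with the smp_wmb() in __loaded_vmcs_clear() above. As an illustration of the ordering the pair enforces (the interleaving table is mine; the names come from this diff):

/*
 * Writer: __loaded_vmcs_clear() on the old cpu (via IPI).
 * Reader: vmx_vcpu_load() on the new cpu.
 *
 *   writer                                    reader
 *   ------                                    ------
 *   list_del(&v->loaded_vmcss_on_cpu_link);   reads v->cpu (the != cpu check)
 *   smp_wmb();                                smp_rmb();
 *   v->cpu = -1; (in loaded_vmcs_init)        list_add(&v->loaded_vmcss_on_cpu_link, ...);
 *
 * Without the barriers, the reader could observe cpu == -1 before the
 * writer's list_del became visible, and re-link the entry while it is
 * still on the old cpu's list.
 */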
@@ -1839,11 +1887,10 @@ static u64 guest_read_tsc(void)
  * Like guest_read_tsc, but always returns L1's notion of the timestamp
  * counter, even if a nested guest (L2) is currently running.
  */
-u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu)
+u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
 {
-	u64 host_tsc, tsc_offset;
+	u64 tsc_offset;
 
-	rdtscll(host_tsc);
 	tsc_offset = is_guest_mode(vcpu) ?
 		to_vmx(vcpu)->nested.vmcs01_tsc_offset :
 		vmcs_read64(TSC_OFFSET);
@@ -1866,6 +1913,11 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
 		WARN(1, "user requested TSC rate below hardware speed\n");
 }
 
+static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
+{
+	return vmcs_read64(TSC_OFFSET);
+}
+
 /*
  * writes 'offset' into guest's timestamp counter offset register
  */
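Two related TSC changes meet here: vmx_read_l1_tsc() now receives the host TSC from its caller instead of executing rdtscll() itself, so common x86 code can sample the counter once and feed the same value to several calculations, and the new vmx_read_tsc_offset() hook exposes the current TSC_OFFSET for the generic MSR_IA32_TSC_ADJUST bookkeeping. A minimal caller sketch; the call site and helper name are assumptions, not part of this diff:

static void record_guest_tsc(struct kvm_vcpu *vcpu)
{
        u64 host_tsc;

        rdtscll(host_tsc);      /* sample the host TSC exactly once... */
        /* ...then hand the same value to the vendor hook */
        vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, host_tsc);
}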
@@ -2202,15 +2254,17 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
  * Returns 0 on success, non-0 otherwise.
  * Assumes vcpu_load() was already called.
  */
-static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct shared_msr_entry *msr;
 	int ret = 0;
+	u32 msr_index = msr_info->index;
+	u64 data = msr_info->data;
 
 	switch (msr_index) {
 	case MSR_EFER:
-		ret = kvm_set_msr_common(vcpu, msr_index, data);
+		ret = kvm_set_msr_common(vcpu, msr_info);
 		break;
 #ifdef CONFIG_X86_64
 	case MSR_FS_BASE:
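vmx_set_msr() now takes a whole struct msr_data instead of an (index, data) pair. For reference, the structure this series adds to arch/x86/include/asm/kvm_host.h:

struct msr_data {
        bool host_initiated;
        u32 index;
        u64 data;
};

The host_initiated flag lets kvm_set_msr_common() distinguish a guest WRMSR from a host-side ioctl write, which matters for MSR_IA32_TSC and for the MSR_IA32_TSC_ADJUST case added below.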
@@ -2236,7 +2290,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 		vmcs_writel(GUEST_SYSENTER_ESP, data);
 		break;
 	case MSR_IA32_TSC:
-		kvm_write_tsc(vcpu, data);
+		kvm_write_tsc(vcpu, msr_info);
 		break;
 	case MSR_IA32_CR_PAT:
 		if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
@@ -2244,7 +2298,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 			vcpu->arch.pat = data;
 			break;
 		}
-		ret = kvm_set_msr_common(vcpu, msr_index, data);
+		ret = kvm_set_msr_common(vcpu, msr_info);
+		break;
+	case MSR_IA32_TSC_ADJUST:
+		ret = kvm_set_msr_common(vcpu, msr_info);
 		break;
 	case MSR_TSC_AUX:
 		if (!vmx->rdtscp_enabled)
@@ -2267,7 +2324,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 			}
 			break;
 		}
-		ret = kvm_set_msr_common(vcpu, msr_index, data);
+		ret = kvm_set_msr_common(vcpu, msr_info);
 	}
 
 	return ret;
@@ -2341,6 +2398,18 @@ static int hardware_enable(void *garbage)
 		return -EBUSY;
 
 	INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+
+	/*
+	 * Now we can enable the vmclear operation in kdump
+	 * since the loaded_vmcss_on_cpu list on this cpu
+	 * has been initialized.
+	 *
+	 * Though the cpu is not in VMX operation yet, there
+	 * is no problem in enabling the vmclear operation,
+	 * as the loaded_vmcss_on_cpu list is empty.
+	 */
+	crash_enable_local_vmclear(cpu);
+
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
 
 	test_bits = FEATURE_CONTROL_LOCKED;
@@ -2697,6 +2766,7 @@ static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment
 	if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) {
 		tmp.base = vmcs_readl(sf->base);
 		tmp.selector = vmcs_read16(sf->selector);
+		tmp.dpl = tmp.selector & SELECTOR_RPL_MASK;
 		tmp.s = 1;
 	}
 	vmx_set_segment(vcpu, &tmp, seg);
@@ -3246,7 +3316,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
 	 * unrestricted guest like Westmere to an older host that doesn't
 	 * have unrestricted guest, like Nehalem.
 	 */
-	if (!enable_unrestricted_guest && vmx->rmode.vm86_active) {
+	if (vmx->rmode.vm86_active) {
 		switch (seg) {
 		case VCPU_SREG_CS:
 			vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
@@ -3897,8 +3967,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
 	set_cr4_guest_host_mask(vmx);
 
-	kvm_write_tsc(&vmx->vcpu, 0);
-
 	return 0;
 }
 
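The kvm_write_tsc(&vmx->vcpu, 0) removed here is not dropped; in this series it moves into vendor-neutral x86 code so SVM gets the same behaviour, and the fx_init()/register initialization removed from vmx_vcpu_reset() below moves into common vcpu setup as well. A sketch of the companion x86.c hook, reconstructed from the same series (the details are an assumption):

int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
        int r;
        struct msr_data msr;

        r = vcpu_load(vcpu);
        if (r)
                return r;
        /* Initialize the guest TSC to zero once, right after creation. */
        msr.data = 0;
        msr.index = MSR_IA32_TSC;
        msr.host_initiated = true;
        kvm_write_tsc(vcpu, &msr);
        vcpu_put(vcpu);

        return r;
}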
3904 | 3972 | ||
@@ -3908,8 +3976,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
3908 | u64 msr; | 3976 | u64 msr; |
3909 | int ret; | 3977 | int ret; |
3910 | 3978 | ||
3911 | vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); | ||
3912 | |||
3913 | vmx->rmode.vm86_active = 0; | 3979 | vmx->rmode.vm86_active = 0; |
3914 | 3980 | ||
3915 | vmx->soft_vnmi_blocked = 0; | 3981 | vmx->soft_vnmi_blocked = 0; |
@@ -3921,10 +3987,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 		msr |= MSR_IA32_APICBASE_BSP;
 	kvm_set_apic_base(&vmx->vcpu, msr);
 
-	ret = fx_init(&vmx->vcpu);
-	if (ret != 0)
-		goto out;
-
 	vmx_segment_cache_clear(vmx);
 
 	seg_setup(VCPU_SREG_CS);
@@ -3965,7 +4027,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 		kvm_rip_write(vcpu, 0xfff0);
 	else
 		kvm_rip_write(vcpu, 0);
-	kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
 
 	vmcs_writel(GUEST_GDTR_BASE, 0);
 	vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
@@ -4015,7 +4076,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	/* HACK: Don't enable emulation on guest boot/reset */
 	vmx->emulation_required = 0;
 
-out:
 	return ret;
 }
 
@@ -4287,16 +4347,6 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 	if (is_machine_check(intr_info))
 		return handle_machine_check(vcpu);
 
-	if ((vect_info & VECTORING_INFO_VALID_MASK) &&
-	    !is_page_fault(intr_info)) {
-		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
-		vcpu->run->internal.ndata = 2;
-		vcpu->run->internal.data[0] = vect_info;
-		vcpu->run->internal.data[1] = intr_info;
-		return 0;
-	}
-
 	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
 		return 1;  /* already handled by vmx_vcpu_run() */
 
@@ -4315,6 +4365,22 @@
 	error_code = 0;
 	if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
 		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+
+	/*
+	 * A #PF with PFEC.RSVD = 1 indicates the guest is accessing
+	 * MMIO, so it is better to report an internal error.
+	 * See the comments in vmx_handle_exit.
+	 */
+	if ((vect_info & VECTORING_INFO_VALID_MASK) &&
+	    !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
+		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
+		vcpu->run->internal.ndata = 2;
+		vcpu->run->internal.data[0] = vect_info;
+		vcpu->run->internal.data[1] = intr_info;
+		return 0;
+	}
+
 	if (is_page_fault(intr_info)) {
 		/* EPT won't cause page fault directly */
 		BUG_ON(enable_ept);
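For reference, PFEC.RSVD is bit 3 of the x86 page-fault error code. KVM deliberately plants reserved bits in its MMIO shadow PTEs, so a guest access to emulated MMIO faults with RSVD = 1 and can be told apart from an ordinary guest page fault (bit definitions as in arch/x86/include/asm/kvm_host.h):

#define PFERR_PRESENT_MASK      (1U << 0)
#define PFERR_WRITE_MASK        (1U << 1)
#define PFERR_USER_MASK         (1U << 2)
#define PFERR_RSVD_MASK         (1U << 3)
#define PFERR_FETCH_MASK        (1U << 4)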
@@ -4626,11 +4692,15 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu)
 
 static int handle_wrmsr(struct kvm_vcpu *vcpu)
 {
+	struct msr_data msr;
 	u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
 	u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
 		| ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
 
-	if (vmx_set_msr(vcpu, ecx, data) != 0) {
+	msr.data = data;
+	msr.index = ecx;
+	msr.host_initiated = false;
+	if (vmx_set_msr(vcpu, &msr) != 0) {
 		trace_kvm_msr_write_ex(ecx, data);
 		kvm_inject_gp(vcpu, 0);
 		return 1;
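handle_wrmsr() marks the write as guest-initiated (host_initiated = false); the ioctl path that services KVM_SET_MSRS sets the flag instead. A sketch of the corresponding helper on the x86.c side (a companion change, not part of this file's diff):

static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
        struct msr_data msr;

        msr.data = *data;
        msr.index = index;
        msr.host_initiated = true;      /* the write comes from the host ioctl */
        return kvm_set_msr(vcpu, &msr);
}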
@@ -4827,11 +4897,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
-	if (exit_qualification & (1 << 6)) {
-		printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
-		return -EINVAL;
-	}
-
 	gla_validity = (exit_qualification >> 7) & 0x3;
 	if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
 		printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
@@ -5979,13 +6044,24 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 		return 0;
 	}
 
+	/*
+	 * Note:
+	 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it is caused
+	 * by a delivery event, since that indicates the guest is
+	 * accessing MMIO. The vm-exit can trigger again after returning
+	 * to the guest, which would cause an infinite loop.
+	 */
 	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
 			(exit_reason != EXIT_REASON_EXCEPTION_NMI &&
 			exit_reason != EXIT_REASON_EPT_VIOLATION &&
-			exit_reason != EXIT_REASON_TASK_SWITCH))
-		printk(KERN_WARNING "%s: unexpected, valid vectoring info "
-		       "(0x%x) and exit reason is 0x%x\n",
-		       __func__, vectoring_info, exit_reason);
+			exit_reason != EXIT_REASON_TASK_SWITCH)) {
+		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
+		vcpu->run->internal.ndata = 2;
+		vcpu->run->internal.data[0] = vectoring_info;
+		vcpu->run->internal.data[1] = exit_reason;
+		return 0;
+	}
 
 	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
 	    !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
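Instead of a printk that is easy to miss, an unfixable vectoring-info exit now reaches userspace as KVM_EXIT_INTERNAL_ERROR with the KVM_INTERNAL_ERROR_DELIVERY_EV suberror added to <linux/kvm.h> by this series. A minimal sketch of how a VMM's run loop might surface it (the helper itself is hypothetical):

#include <stdio.h>
#include <linux/kvm.h>

static void report_delivery_ev(struct kvm_run *run)
{
        if (run->exit_reason == KVM_EXIT_INTERNAL_ERROR &&
            run->internal.suberror == KVM_INTERNAL_ERROR_DELIVERY_EV)
                fprintf(stderr,
                        "event delivery hit MMIO: vectoring info 0x%llx, exit reason 0x%llx\n",
                        (unsigned long long)run->internal.data[0],
                        (unsigned long long)run->internal.data[1]);
}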
@@ -7309,6 +7385,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 
 	.set_tsc_khz = vmx_set_tsc_khz,
+	.read_tsc_offset = vmx_read_tsc_offset,
 	.write_tsc_offset = vmx_write_tsc_offset,
 	.adjust_tsc_offset = vmx_adjust_tsc_offset,
 	.compute_tsc_offset = vmx_compute_tsc_offset,
@@ -7367,6 +7444,11 @@ static int __init vmx_init(void)
 	if (r)
 		goto out3;
 
+#ifdef CONFIG_KEXEC
+	rcu_assign_pointer(crash_vmclear_loaded_vmcss,
+			   crash_vmclear_local_loaded_vmcss);
+#endif
+
 	vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
 	vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
 	vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
@@ -7404,6 +7486,11 @@ static void __exit vmx_exit(void)
 	free_page((unsigned long)vmx_io_bitmap_b);
 	free_page((unsigned long)vmx_io_bitmap_a);
 
+#ifdef CONFIG_KEXEC
+	rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL);
+	synchronize_rcu();
+#endif
+
 	kvm_exit();
 }
 