Diffstat (limited to 'arch/x86/kvm/vmx/nested.c')
-rw-r--r--	arch/x86/kvm/vmx/nested.c	129
1 file changed, 79 insertions, 50 deletions
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index d737a51a53ca..f24a2c225070 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -211,7 +211,6 @@ static void free_nested(struct kvm_vcpu *vcpu)
 	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
 		return;
 
-	hrtimer_cancel(&vmx->nested.preemption_timer);
 	vmx->nested.vmxon = false;
 	vmx->nested.smm.vmxon = false;
 	free_vpid(vmx->nested.vpid02);
@@ -274,6 +273,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
 	vcpu_load(vcpu);
+	vmx_leave_nested(vcpu);
 	vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
 	free_nested(vcpu);
 	vcpu_put(vcpu);
@@ -1980,17 +1980,6 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
 	prepare_vmcs02_early_full(vmx, vmcs12);
 
 	/*
-	 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
-	 * entry, but only if the current (host) sp changed from the value
-	 * we wrote last (vmx->host_rsp). This cache is no longer relevant
-	 * if we switch vmcs, and rather than hold a separate cache per vmcs,
-	 * here we just force the write to happen on entry. host_rsp will
-	 * also be written unconditionally by nested_vmx_check_vmentry_hw()
-	 * if we are doing early consistency checks via hardware.
-	 */
-	vmx->host_rsp = 0;
-
-	/*
 	 * PIN CONTROLS
 	 */
 	exec_control = vmcs12->pin_based_vm_exec_control;
@@ -2289,10 +2278,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	}
 	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
 
-	vmx->nested.preemption_timer_expired = false;
-	if (nested_cpu_has_preemption_timer(vmcs12))
-		vmx_start_preemption_timer(vcpu);
-
 	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
 	 * bitwise-or of what L1 wants to trap for L2, and what we want to
 	 * trap. Note that CR0.TS also needs updating - we do this later.
@@ -2722,6 +2707,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long cr3, cr4;
+	bool vm_fail;
 
 	if (!nested_early_check)
 		return 0;
@@ -2755,29 +2741,34 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
 		vmx->loaded_vmcs->host_state.cr4 = cr4;
 	}
 
-	vmx->__launched = vmx->loaded_vmcs->launched;
-
 	asm(
-		/* Set HOST_RSP */
 		"sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
-		__ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
-		"mov %%" _ASM_SP ", %c[host_rsp](%1)\n\t"
+		"cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
+		"je 1f \n\t"
+		__ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t"
+		"mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
+		"1: \n\t"
 		"add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
 
 		/* Check if vmlaunch or vmresume is needed */
-		"cmpl $0, %c[launched](%% " _ASM_CX")\n\t"
+		"cmpb $0, %c[launched](%[loaded_vmcs])\n\t"
 
+		/*
+		 * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set
+		 * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail
+		 * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the
+		 * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail.
+		 */
 		"call vmx_vmenter\n\t"
 
-		/* Set vmx->fail accordingly */
-		"setbe %c[fail](%% " _ASM_CX")\n\t"
-	      : ASM_CALL_CONSTRAINT
-	      : "c"(vmx), "d"((unsigned long)HOST_RSP),
-		[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
-		[fail]"i"(offsetof(struct vcpu_vmx, fail)),
-		[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
+		CC_SET(be)
+	      : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail)
+	      : [HOST_RSP]"r"((unsigned long)HOST_RSP),
+		[loaded_vmcs]"r"(vmx->loaded_vmcs),
+		[launched]"i"(offsetof(struct loaded_vmcs, launched)),
+		[host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
 		[wordsize]"i"(sizeof(ulong))
-	      : "rax", "cc", "memory"
+	      : "cc", "memory"
 	);
 
 	preempt_enable();
@@ -2787,10 +2778,9 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
 	if (vmx->msr_autoload.guest.nr)
 		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
 
-	if (vmx->fail) {
+	if (vm_fail) {
 		WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
 			     VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-		vmx->fail = 0;
 		return 1;
 	}
 
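The CC_SET(be)/CC_OUT(be) pairing above is the kernel's wrapper around GCC's flag-output constraints: the "below or equal" condition (CF or ZF set) that VMLAUNCH/VMRESUME leave in RFLAGS on VM-Fail is written straight into the local vm_fail, replacing the old setbe into vmx->fail. A self-contained userspace sketch of the same mechanism (hypothetical helper, not taken from the kernel):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative only: capture CF||ZF ("below or equal") from CMP, the
 * same flag combination VMLAUNCH/VMRESUME use to signal VM-Fail. */
static bool unsigned_le(unsigned long a, unsigned long b)
{
	bool be;

	asm("cmp %[b], %[a]"	/* computes a - b, sets CF/ZF */
	    : "=@ccbe" (be)	/* be: CF=1 or ZF=1 */
	    : [a] "r" (a), [b] "r" (b));
	return be;
}

int main(void)
{
	printf("%d %d\n", unsigned_le(1, 2), unsigned_le(3, 2));	/* 1 0 */
	return 0;
}

With GCC 6 or later the flag is materialized directly into the output operand; on older compilers the CC_SET/CC_OUT macros fall back to an explicit setcc sequence.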
@@ -2813,8 +2803,6 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
 
 	return 0;
 }
-STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw);
-
 
 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 						 struct vmcs12 *vmcs12);
@@ -3031,6 +3019,15 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 
 	/*
+	 * Do not start the preemption timer hrtimer until after we know
+	 * we are successful, so that only nested_vmx_vmexit needs to cancel
+	 * the timer.
+	 */
+	vmx->nested.preemption_timer_expired = false;
+	if (nested_cpu_has_preemption_timer(vmcs12))
+		vmx_start_preemption_timer(vcpu);
+
+	/*
 	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
 	 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
 	 * returned as far as L1 is concerned. It will only return (and set
@@ -3450,13 +3447,10 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	else
 		vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
 
-	if (nested_cpu_has_preemption_timer(vmcs12)) {
-		if (vmcs12->vm_exit_controls &
-		    VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
+	if (nested_cpu_has_preemption_timer(vmcs12) &&
+	    vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
 		vmcs12->vmx_preemption_timer_value =
 			vmx_get_preemption_timer_value(vcpu);
-		hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
-	}
 
 	/*
 	 * In some cases (usually, nested EPT), L2 is allowed to change its
@@ -3864,6 +3858,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 
 	leave_guest_mode(vcpu);
 
+	if (nested_cpu_has_preemption_timer(vmcs12))
+		hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
+
 	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
 		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
 
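Together with the hunk in nested_vmx_enter_non_root_mode() above, this gives the preemption timer a single lifecycle: it is armed only once VM-Entry can no longer fail, and cancelled only here on the VM-Exit path, so free_nested() and sync_vmcs12() no longer need their own hrtimer_cancel() calls. A minimal, hypothetical module sketch of that arm-late/cancel-once pattern with the hrtimer API; none of it is taken from the patch:

// SPDX-License-Identifier: GPL-2.0
#include <linux/module.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer demo_timer;

static enum hrtimer_restart demo_timer_fn(struct hrtimer *t)
{
	pr_info("demo timer fired\n");
	return HRTIMER_NORESTART;
}

static int __init demo_init(void)
{
	hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	demo_timer.function = demo_timer_fn;
	/* Arm only once setup can no longer fail. */
	hrtimer_start(&demo_timer, ms_to_ktime(100), HRTIMER_MODE_REL);
	return 0;
}

static void __exit demo_exit(void)
{
	/* The one teardown path cancels the timer. */
	hrtimer_cancel(&demo_timer);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");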
@@ -3915,9 +3912,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 		vmx_flush_tlb(vcpu, true);
 	}
 
-	/* This is needed for same reason as it was needed in prepare_vmcs02 */
-	vmx->host_rsp = 0;
-
 	/* Unpin physical memory we referred to in vmcs02 */
 	if (vmx->nested.apic_access_page) {
 		kvm_release_page_dirty(vmx->nested.apic_access_page);
@@ -4035,25 +4029,50 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
 	/* Addr = segment_base + offset */
 	/* offset = base + [index * scale] + displacement */
 	off = exit_qualification; /* holds the displacement */
+	if (addr_size == 1)
+		off = (gva_t)sign_extend64(off, 31);
+	else if (addr_size == 0)
+		off = (gva_t)sign_extend64(off, 15);
 	if (base_is_valid)
 		off += kvm_register_read(vcpu, base_reg);
 	if (index_is_valid)
 		off += kvm_register_read(vcpu, index_reg)<<scaling;
 	vmx_get_segment(vcpu, &s, seg_reg);
-	*ret = s.base + off;
 
+	/*
+	 * The effective address, i.e. @off, of a memory operand is truncated
+	 * based on the address size of the instruction. Note that this is
+	 * the *effective address*, i.e. the address prior to accounting for
+	 * the segment's base.
+	 */
 	if (addr_size == 1) /* 32 bit */
-		*ret &= 0xffffffff;
+		off &= 0xffffffff;
+	else if (addr_size == 0) /* 16 bit */
+		off &= 0xffff;
 
 	/* Checks for #GP/#SS exceptions. */
 	exn = false;
 	if (is_long_mode(vcpu)) {
+		/*
+		 * The virtual/linear address is never truncated in 64-bit
+		 * mode, e.g. a 32-bit address size can yield a 64-bit virtual
+		 * address when using FS/GS with a non-zero base.
+		 */
+		*ret = s.base + off;
+
 		/* Long mode: #GP(0)/#SS(0) if the memory address is in a
 		 * non-canonical form. This is the only check on the memory
 		 * destination for long mode!
 		 */
		exn = is_noncanonical_address(*ret, vcpu);
-	} else if (is_protmode(vcpu)) {
+	} else {
+		/*
+		 * When not in long mode, the virtual/linear address is
+		 * unconditionally truncated to 32 bits regardless of the
+		 * address size.
+		 */
+		*ret = (s.base + off) & 0xffffffff;
+
 		/* Protected mode: apply checks for segment validity in the
 		 * following order:
 		 * - segment type check (#GP(0) may be thrown)
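The comments added here distinguish the effective address (displacement plus base plus scaled index, sign-extended and then truncated to the instruction's address size) from the linear address (segment base plus effective address, truncated to 32 bits only outside long mode). A standalone toy model of that computation; the addr_size encoding (0 = 16-bit, 1 = 32-bit) follows the VMX instruction-information field, everything else is illustrative:

#include <stdint.h>
#include <stdio.h>

/* Local reimplementation of the kernel's sign_extend64(): @index is the
 * bit position of the sign bit. */
static uint64_t sign_extend64(uint64_t value, int index)
{
	int shift = 63 - index;

	return (uint64_t)((int64_t)(value << shift) >> shift);
}

static uint64_t linear_address(uint64_t seg_base, uint64_t disp,
			       uint64_t base, uint64_t index, int scale,
			       int addr_size, int long_mode)
{
	uint64_t off = disp;

	/* The displacement is sign-extended per the address size... */
	if (addr_size == 1)
		off = sign_extend64(off, 31);
	else if (addr_size == 0)
		off = sign_extend64(off, 15);
	off += base + (index << scale);

	/* ...and the resulting *effective* address is truncated to it. */
	if (addr_size == 1)
		off &= 0xffffffff;
	else if (addr_size == 0)
		off &= 0xffff;

	/* The *linear* address is only truncated outside long mode. */
	return long_mode ? seg_base + off : (seg_base + off) & 0xffffffff;
}

int main(void)
{
	/* 32-bit address size in long mode with a non-zero FS-style base:
	 * the effective address wraps at 4 GiB, the linear one does not. */
	printf("%#llx\n", (unsigned long long)
	       linear_address(0x100000000ull, 0xfffffff0u, 0x20, 0, 0, 1, 1));
	return 0;
}

Running it prints 0x100000010: the 32-bit effective address wraps to 0x10, while the linear address still lands above 4 GiB, which is exactly the case the long-mode comment calls out.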
@@ -4077,10 +4096,16 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
 		/* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
 		 */
 		exn = (s.unusable != 0);
-		/* Protected mode: #GP(0)/#SS(0) if the memory
-		 * operand is outside the segment limit.
+
+		/*
+		 * Protected mode: #GP(0)/#SS(0) if the memory operand is
+		 * outside the segment limit. All CPUs that support VMX ignore
+		 * limit checks for flat segments, i.e. segments with base==0,
+		 * limit==0xffffffff and of type expand-up data or code.
 		 */
-		exn = exn || (off + sizeof(u64) > s.limit);
+		if (!(s.base == 0 && s.limit == 0xffffffff &&
+		      ((s.type & 8) || !(s.type & 4))))
+			exn = exn || (off + sizeof(u64) > s.limit);
 	}
 	if (exn) {
 		kvm_queue_exception_e(vcpu,
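The new guard relies on the x86 segment-type encoding: bit 3 set means a code segment, and for data segments bit 2 is the expand-down flag, so (s.type & 8) || !(s.type & 4) matches code and expand-up data. A small, hypothetical illustration of the check; the field names mirror the patch, but the struct is not the kernel's:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the segment fields used by the check. */
struct seg {
	uint64_t base;
	uint32_t limit;
	uint8_t type;	/* 4-bit segment type from the access rights */
};

/* Mirrors the patch's condition: a flat code or expand-up data segment,
 * for which CPUs supporting VMX skip the limit check. */
static bool is_flat_expand_up_or_code(const struct seg *s)
{
	return s->base == 0 && s->limit == 0xffffffff &&
	       ((s->type & 8) ||	/* bit 3: code segment */
		!(s->type & 4));	/* bit 2 clear: expand-up data */
}

int main(void)
{
	struct seg flat_data = { 0, 0xffffffff, 0x3 };	/* expand-up, writable */
	struct seg expand_down = { 0, 0xffffffff, 0x7 };	/* expand-down, writable */

	printf("%d %d\n", is_flat_expand_up_or_code(&flat_data),
	       is_flat_expand_up_or_code(&expand_down));	/* 1 0 */
	return 0;
}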
@@ -4145,11 +4170,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 	if (r < 0)
 		goto out_vmcs02;
 
-	vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL);
+	vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
 	if (!vmx->nested.cached_vmcs12)
 		goto out_cached_vmcs12;
 
-	vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL);
+	vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
 	if (!vmx->nested.cached_shadow_vmcs12)
 		goto out_cached_shadow_vmcs12;
 
@@ -5696,6 +5721,10 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
 		enable_shadow_vmcs = 0;
 	if (enable_shadow_vmcs) {
 		for (i = 0; i < VMX_BITMAP_NR; i++) {
+			/*
+			 * The vmx_bitmap is not tied to a VM and so should
+			 * not be charged to a memcg.
+			 */
 			vmx_bitmap[i] = (unsigned long *)
 				__get_free_page(GFP_KERNEL);
 			if (!vmx_bitmap[i]) {